4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
27 #include <sys/zfs_context.h>
29 #include <sys/vdev_impl.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
36 * Virtual device vector for RAID-Z.
38 * This vdev supports single, double, and triple parity. For single parity,
39 * we use a simple XOR of all the data columns. For double or triple parity,
40 * we use a special case of Reed-Solomon coding. This extends the
41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44 * former is also based. The latter is designed to provide higher performance
47 * Note that the Plank paper claimed to support arbitrary N+M, but was then
48 * amended six years later identifying a critical flaw that invalidates its
49 * claims. Nevertheless, the technique can be adapted to work for up to
50 * triple parity. For additional parity, the amendment "Note: Correction to
51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52 * is viable, but the additional complexity means that write performance will
55 * All of the methods above operate on a Galois field, defined over the
56 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
57 * can be expressed with a single byte. Briefly, the operations on the
58 * field are defined as follows:
60 * o addition (+) is represented by a bitwise XOR
61 * o subtraction (-) is therefore identical to addition: A + B = A - B
62 * o multiplication of A by 2 is defined by the following bitwise expression:
66 * (A * 2)_4 = A_3 + A_7
67 * (A * 2)_3 = A_2 + A_7
68 * (A * 2)_2 = A_1 + A_7
72 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73 * As an aside, this multiplication is derived from the error correcting
74 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
76 * Observe that any number in the field (except for 0) can be expressed as a
77 * power of 2 -- a generator for the field. We store a table of the powers of
78 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80 * than field addition). The inverse of a field element A (A^-1) is therefore
81 * A ^ (255 - 1) = A^254.
83 * The up-to-three parity columns, P, Q, R over several data columns,
84 * D_0, ... D_n-1, can be expressed by field operations:
86 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
87 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
92 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
93 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94 * independent coefficients. (There are no additional coefficients that have
95 * this property which is why the uncorrected Plank method breaks down.)
97 * See the reconstruction code below for how P, Q and R can be used individually
98 * or in concert to recover missing data columns.
101 typedef struct raidz_col {
102 uint64_t rc_devidx; /* child device index for I/O */
103 uint64_t rc_offset; /* device offset */
104 uint64_t rc_size; /* I/O size */
105 void *rc_data; /* I/O data */
106 void *rc_gdata; /* used to store the "good" version */
107 int rc_error; /* I/O error for this device */
108 uint8_t rc_tried; /* Did we attempt this I/O column? */
109 uint8_t rc_skipped; /* Did we skip this I/O column? */
112 typedef struct raidz_map {
113 uint64_t rm_cols; /* Regular column count */
114 uint64_t rm_scols; /* Count including skipped columns */
115 uint64_t rm_bigcols; /* Number of oversized columns */
116 uint64_t rm_asize; /* Actual total I/O size */
117 uint64_t rm_missingdata; /* Count of missing data devices */
118 uint64_t rm_missingparity; /* Count of missing parity devices */
119 uint64_t rm_firstdatacol; /* First data column/parity count */
120 uint64_t rm_nskip; /* Skipped sectors for padding */
121 uint64_t rm_skipstart; /* Column index of padding start */
122 void *rm_datacopy; /* rm_asize-buffer of copied data */
123 uintptr_t rm_reports; /* # of referencing checksum reports */
124 uint8_t rm_freed; /* map no longer has referencing ZIO */
125 uint8_t rm_ecksuminjected; /* checksum error was injected */
126 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
129 #define VDEV_RAIDZ_P 0
130 #define VDEV_RAIDZ_Q 1
131 #define VDEV_RAIDZ_R 2
133 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
134 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
137 * We provide a mechanism to perform the field multiplication operation on a
138 * 64-bit value all at once rather than a byte at a time. This works by
139 * creating a mask from the top bit in each byte and using that to
140 * conditionally apply the XOR of 0x1d.
142 #define VDEV_RAIDZ_64MUL_2(x, mask) \
144 (mask) = (x) & 0x8080808080808080ULL; \
145 (mask) = ((mask) << 1) - ((mask) >> 7); \
146 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
147 ((mask) & 0x1d1d1d1d1d1d1d1d); \
150 #define VDEV_RAIDZ_64MUL_4(x, mask) \
152 VDEV_RAIDZ_64MUL_2((x), mask); \
153 VDEV_RAIDZ_64MUL_2((x), mask); \
157 * Force reconstruction to use the general purpose method.
159 int vdev_raidz_default_to_general;
162 * These two tables represent powers and logs of 2 in the Galois field defined
163 * above. These values were computed by repeatedly multiplying by 2 as above.
165 static const uint8_t vdev_raidz_pow2[256] = {
166 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
167 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
168 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
169 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
170 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
171 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
172 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
173 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
174 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
175 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
176 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
177 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
178 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
179 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
180 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
181 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
182 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
183 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
184 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
185 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
186 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
187 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
188 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
189 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
190 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
191 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
192 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
193 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
194 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
195 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
196 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
197 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
199 static const uint8_t vdev_raidz_log2[256] = {
200 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
201 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
202 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
203 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
204 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
205 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
206 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
207 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
208 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
209 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
210 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
211 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
212 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
213 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
214 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
215 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
216 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
217 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
218 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
219 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
220 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
221 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
222 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
223 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
224 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
225 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
226 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
227 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
228 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
229 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
230 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
231 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
234 static void vdev_raidz_generate_parity(raidz_map_t *rm);
237 * Multiply a given number by 2 raised to the given power.
240 vdev_raidz_exp2(uint_t a, int exp)
246 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
248 exp += vdev_raidz_log2[a];
252 return (vdev_raidz_pow2[exp]);
256 vdev_raidz_map_free(raidz_map_t *rm)
261 for (c = 0; c < rm->rm_firstdatacol; c++) {
262 if (rm->rm_col[c].rc_data != NULL)
263 zio_buf_free(rm->rm_col[c].rc_data,
264 rm->rm_col[c].rc_size);
266 if (rm->rm_col[c].rc_gdata != NULL)
267 zio_buf_free(rm->rm_col[c].rc_gdata,
268 rm->rm_col[c].rc_size);
272 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
273 size += rm->rm_col[c].rc_size;
275 if (rm->rm_datacopy != NULL)
276 zio_buf_free(rm->rm_datacopy, size);
278 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
282 vdev_raidz_map_free_vsd(zio_t *zio)
284 raidz_map_t *rm = zio->io_vsd;
286 ASSERT0(rm->rm_freed);
289 if (rm->rm_reports == 0)
290 vdev_raidz_map_free(rm);
295 vdev_raidz_cksum_free(void *arg, size_t ignored)
297 raidz_map_t *rm = arg;
299 ASSERT3U(rm->rm_reports, >, 0);
301 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
302 vdev_raidz_map_free(rm);
306 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
308 raidz_map_t *rm = zcr->zcr_cbdata;
309 size_t c = zcr->zcr_cbinfo;
312 const char *good = NULL;
313 const char *bad = rm->rm_col[c].rc_data;
315 if (good_data == NULL) {
316 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
320 if (c < rm->rm_firstdatacol) {
322 * The first time through, calculate the parity blocks for
323 * the good data (this relies on the fact that the good
324 * data never changes for a given logical ZIO)
326 if (rm->rm_col[0].rc_gdata == NULL) {
327 char *bad_parity[VDEV_RAIDZ_MAXPARITY];
331 * Set up the rm_col[]s to generate the parity for
332 * good_data, first saving the parity bufs and
333 * replacing them with buffers to hold the result.
335 for (x = 0; x < rm->rm_firstdatacol; x++) {
336 bad_parity[x] = rm->rm_col[x].rc_data;
337 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
338 zio_buf_alloc(rm->rm_col[x].rc_size);
341 /* fill in the data columns from good_data */
342 buf = (char *)good_data;
343 for (; x < rm->rm_cols; x++) {
344 rm->rm_col[x].rc_data = buf;
345 buf += rm->rm_col[x].rc_size;
349 * Construct the parity from the good data.
351 vdev_raidz_generate_parity(rm);
353 /* restore everything back to its original state */
354 for (x = 0; x < rm->rm_firstdatacol; x++)
355 rm->rm_col[x].rc_data = bad_parity[x];
357 buf = rm->rm_datacopy;
358 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
359 rm->rm_col[x].rc_data = buf;
360 buf += rm->rm_col[x].rc_size;
364 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
365 good = rm->rm_col[c].rc_gdata;
367 /* adjust good_data to point at the start of our column */
370 for (x = rm->rm_firstdatacol; x < c; x++)
371 good += rm->rm_col[x].rc_size;
374 /* we drop the ereport if it ends up that the data was good */
375 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
379 * Invoked indirectly by zfs_ereport_start_checksum(), called
380 * below when our read operation fails completely. The main point
381 * is to keep a copy of everything we read from disk, so that at
382 * vdev_raidz_cksum_finish() time we can compare it with the good data.
385 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
387 size_t c = (size_t)(uintptr_t)arg;
390 raidz_map_t *rm = zio->io_vsd;
393 /* set up the report and bump the refcount */
394 zcr->zcr_cbdata = rm;
396 zcr->zcr_finish = vdev_raidz_cksum_finish;
397 zcr->zcr_free = vdev_raidz_cksum_free;
400 ASSERT3U(rm->rm_reports, >, 0);
402 if (rm->rm_datacopy != NULL)
406 * It's the first time we're called for this raidz_map_t, so we need
407 * to copy the data aside; there's no guarantee that our zio's buffer
408 * won't be re-used for something else.
410 * Our parity data is already in separate buffers, so there's no need
415 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
416 size += rm->rm_col[c].rc_size;
418 buf = rm->rm_datacopy = zio_buf_alloc(size);
420 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
421 raidz_col_t *col = &rm->rm_col[c];
423 bcopy(col->rc_data, buf, col->rc_size);
428 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
431 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
432 vdev_raidz_map_free_vsd,
433 vdev_raidz_cksum_report
437 * Divides the IO evenly across all child vdevs; usually, dcols is
438 * the number of children in the target vdev.
441 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
445 /* The starting RAIDZ (parent) vdev sector of the block. */
446 uint64_t b = zio->io_offset >> unit_shift;
447 /* The zio's size in units of the vdev's minimum sector size. */
448 uint64_t s = zio->io_size >> unit_shift;
449 /* The first column for this stripe. */
450 uint64_t f = b % dcols;
451 /* The starting byte offset on each child vdev. */
452 uint64_t o = (b / dcols) << unit_shift;
453 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
456 * "Quotient": The number of data sectors for this stripe on all but
457 * the "big column" child vdevs that also contain "remainder" data.
459 q = s / (dcols - nparity);
462 * "Remainder": The number of partial stripe data sectors in this I/O.
463 * This will add a sector to some, but not all, child vdevs.
465 r = s - q * (dcols - nparity);
467 /* The number of "big columns" - those which contain remainder data. */
468 bc = (r == 0 ? 0 : r + nparity);
471 * The total number of data and parity sectors associated with
474 tot = s + nparity * (q + (r == 0 ? 0 : 1));
476 /* acols: The columns that will be accessed. */
477 /* scols: The columns that will be accessed or skipped. */
479 /* Our I/O request doesn't span all child vdevs. */
481 scols = MIN(dcols, roundup(bc, nparity + 1));
487 ASSERT3U(acols, <=, scols);
489 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
492 rm->rm_scols = scols;
494 rm->rm_skipstart = bc;
495 rm->rm_missingdata = 0;
496 rm->rm_missingparity = 0;
497 rm->rm_firstdatacol = nparity;
498 rm->rm_datacopy = NULL;
501 rm->rm_ecksuminjected = 0;
505 for (c = 0; c < scols; c++) {
510 coff += 1ULL << unit_shift;
512 rm->rm_col[c].rc_devidx = col;
513 rm->rm_col[c].rc_offset = coff;
514 rm->rm_col[c].rc_data = NULL;
515 rm->rm_col[c].rc_gdata = NULL;
516 rm->rm_col[c].rc_error = 0;
517 rm->rm_col[c].rc_tried = 0;
518 rm->rm_col[c].rc_skipped = 0;
521 rm->rm_col[c].rc_size = 0;
523 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
525 rm->rm_col[c].rc_size = q << unit_shift;
527 asize += rm->rm_col[c].rc_size;
530 ASSERT3U(asize, ==, tot << unit_shift);
531 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
532 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
533 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
534 ASSERT3U(rm->rm_nskip, <=, nparity);
536 if (zio->io_type != ZIO_TYPE_FREE) {
537 for (c = 0; c < rm->rm_firstdatacol; c++) {
538 rm->rm_col[c].rc_data =
539 zio_buf_alloc(rm->rm_col[c].rc_size);
542 rm->rm_col[c].rc_data = zio->io_data;
544 for (c = c + 1; c < acols; c++) {
545 rm->rm_col[c].rc_data =
546 (char *)rm->rm_col[c - 1].rc_data +
547 rm->rm_col[c - 1].rc_size;
552 * If all data stored spans all columns, there's a danger that parity
553 * will always be on the same device and, since parity isn't read
554 * during normal operation, that that device's I/O bandwidth won't be
555 * used effectively. We therefore switch the parity every 1MB.
557 * ... at least that was, ostensibly, the theory. As a practical
558 * matter unless we juggle the parity between all devices evenly, we
559 * won't see any benefit. Further, occasional writes that aren't a
560 * multiple of the LCM of the number of children and the minimum
561 * stripe width are sufficient to avoid pessimal behavior.
562 * Unfortunately, this decision created an implicit on-disk format
563 * requirement that we need to support for all eternity, but only
564 * for single-parity RAID-Z.
566 * If we intend to skip a sector in the zeroth column for padding
567 * we must make sure to note this swap. We will never intend to
568 * skip the first column since at least one data and one parity
569 * column must appear in each row.
571 ASSERT(rm->rm_cols >= 2);
572 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
574 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
575 devidx = rm->rm_col[0].rc_devidx;
576 o = rm->rm_col[0].rc_offset;
577 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
578 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
579 rm->rm_col[1].rc_devidx = devidx;
580 rm->rm_col[1].rc_offset = o;
582 if (rm->rm_skipstart == 0)
583 rm->rm_skipstart = 1;
587 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
592 vdev_raidz_generate_parity_p(raidz_map_t *rm)
594 uint64_t *p, *src, pcount, ccount, i;
597 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
599 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
600 src = rm->rm_col[c].rc_data;
601 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
602 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
604 if (c == rm->rm_firstdatacol) {
605 ASSERT(ccount == pcount);
606 for (i = 0; i < ccount; i++, src++, p++) {
610 ASSERT(ccount <= pcount);
611 for (i = 0; i < ccount; i++, src++, p++) {
619 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
621 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
624 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
625 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
626 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
628 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
629 src = rm->rm_col[c].rc_data;
630 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
631 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
633 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
635 if (c == rm->rm_firstdatacol) {
636 ASSERT(ccnt == pcnt || ccnt == 0);
637 for (i = 0; i < ccnt; i++, src++, p++, q++) {
641 for (; i < pcnt; i++, src++, p++, q++) {
646 ASSERT(ccnt <= pcnt);
649 * Apply the algorithm described above by multiplying
650 * the previous result and adding in the new value.
652 for (i = 0; i < ccnt; i++, src++, p++, q++) {
655 VDEV_RAIDZ_64MUL_2(*q, mask);
660 * Treat short columns as though they are full of 0s.
661 * Note that there's therefore nothing needed for P.
663 for (; i < pcnt; i++, q++) {
664 VDEV_RAIDZ_64MUL_2(*q, mask);
671 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
673 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
676 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
677 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
678 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
679 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
680 rm->rm_col[VDEV_RAIDZ_R].rc_size);
682 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
683 src = rm->rm_col[c].rc_data;
684 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
685 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
686 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
688 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
690 if (c == rm->rm_firstdatacol) {
691 ASSERT(ccnt == pcnt || ccnt == 0);
692 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
697 for (; i < pcnt; i++, src++, p++, q++, r++) {
703 ASSERT(ccnt <= pcnt);
706 * Apply the algorithm described above by multiplying
707 * the previous result and adding in the new value.
709 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
712 VDEV_RAIDZ_64MUL_2(*q, mask);
715 VDEV_RAIDZ_64MUL_4(*r, mask);
720 * Treat short columns as though they are full of 0s.
721 * Note that there's therefore nothing needed for P.
723 for (; i < pcnt; i++, q++, r++) {
724 VDEV_RAIDZ_64MUL_2(*q, mask);
725 VDEV_RAIDZ_64MUL_4(*r, mask);
732 * Generate RAID parity in the first virtual columns according to the number of
733 * parity columns available.
736 vdev_raidz_generate_parity(raidz_map_t *rm)
738 switch (rm->rm_firstdatacol) {
740 vdev_raidz_generate_parity_p(rm);
743 vdev_raidz_generate_parity_pq(rm);
746 vdev_raidz_generate_parity_pqr(rm);
749 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
754 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
756 uint64_t *dst, *src, xcount, ccount, count, i;
761 ASSERT(x >= rm->rm_firstdatacol);
762 ASSERT(x < rm->rm_cols);
764 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
765 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
768 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
769 dst = rm->rm_col[x].rc_data;
770 for (i = 0; i < xcount; i++, dst++, src++) {
774 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
775 src = rm->rm_col[c].rc_data;
776 dst = rm->rm_col[x].rc_data;
781 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
782 count = MIN(ccount, xcount);
784 for (i = 0; i < count; i++, dst++, src++) {
789 return (1 << VDEV_RAIDZ_P);
793 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
795 uint64_t *dst, *src, xcount, ccount, count, mask, i;
802 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
803 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
805 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
806 src = rm->rm_col[c].rc_data;
807 dst = rm->rm_col[x].rc_data;
812 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
814 count = MIN(ccount, xcount);
816 if (c == rm->rm_firstdatacol) {
817 for (i = 0; i < count; i++, dst++, src++) {
820 for (; i < xcount; i++, dst++) {
825 for (i = 0; i < count; i++, dst++, src++) {
826 VDEV_RAIDZ_64MUL_2(*dst, mask);
830 for (; i < xcount; i++, dst++) {
831 VDEV_RAIDZ_64MUL_2(*dst, mask);
836 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
837 dst = rm->rm_col[x].rc_data;
838 exp = 255 - (rm->rm_cols - 1 - x);
840 for (i = 0; i < xcount; i++, dst++, src++) {
842 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
843 *b = vdev_raidz_exp2(*b, exp);
847 return (1 << VDEV_RAIDZ_Q);
851 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
853 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
855 uint64_t xsize, ysize, i;
861 ASSERT(x >= rm->rm_firstdatacol);
862 ASSERT(y < rm->rm_cols);
864 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
867 * Move the parity data aside -- we're going to compute parity as
868 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
869 * reuse the parity generation mechanism without trashing the actual
870 * parity so we make those columns appear to be full of zeros by
871 * setting their lengths to zero.
873 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
874 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
875 xsize = rm->rm_col[x].rc_size;
876 ysize = rm->rm_col[y].rc_size;
878 rm->rm_col[VDEV_RAIDZ_P].rc_data =
879 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
880 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
881 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
882 rm->rm_col[x].rc_size = 0;
883 rm->rm_col[y].rc_size = 0;
885 vdev_raidz_generate_parity_pq(rm);
887 rm->rm_col[x].rc_size = xsize;
888 rm->rm_col[y].rc_size = ysize;
892 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
893 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
894 xd = rm->rm_col[x].rc_data;
895 yd = rm->rm_col[y].rc_data;
899 * Pxy = P + D_x + D_y
900 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
902 * We can then solve for D_x:
903 * D_x = A * (P + Pxy) + B * (Q + Qxy)
905 * A = 2^(x - y) * (2^(x - y) + 1)^-1
906 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
908 * With D_x in hand, we can easily solve for D_y:
909 * D_y = P + Pxy + D_x
912 a = vdev_raidz_pow2[255 + x - y];
913 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
914 tmp = 255 - vdev_raidz_log2[a ^ 1];
916 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
917 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
919 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
920 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
921 vdev_raidz_exp2(*q ^ *qxy, bexp);
924 *yd = *p ^ *pxy ^ *xd;
927 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
928 rm->rm_col[VDEV_RAIDZ_P].rc_size);
929 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
930 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
933 * Restore the saved parity data.
935 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
936 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
938 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
943 * In the general case of reconstruction, we must solve the system of linear
944 * equations defined by the coefficients used to generate parity as well as
945 * the contents of the data and parity disks. This can be expressed with
946 * vectors for the original data (D) and the actual data (d) and parity (p)
947 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
951 * | V | | D_0 | | p_m-1 |
952 * | | x | : | = | d_0 |
953 * | I | | D_n-1 | | : |
954 * | | ~~ ~~ | d_n-1 |
957 * I is simply a square identity matrix of size n, and V is a vandermonde
958 * matrix defined by the coefficients we chose for the various parity columns
959 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
960 * computation as well as linear separability.
963 * | 1 .. 1 1 1 | | p_0 |
964 * | 2^n-1 .. 4 2 1 | __ __ | : |
965 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
966 * | 1 .. 0 0 0 | | D_1 | | d_0 |
967 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
968 * | : : : : | | : | | d_2 |
969 * | 0 .. 1 0 0 | | D_n-1 | | : |
970 * | 0 .. 0 1 0 | ~~ ~~ | : |
971 * | 0 .. 0 0 1 | | d_n-1 |
974 * Note that I, V, d, and p are known. To compute D, we must invert the
975 * matrix and use the known data and parity values to reconstruct the unknown
976 * data values. We begin by removing the rows in V|I and d|p that correspond
977 * to failed or missing columns; we then make V|I square (n x n) and d|p
978 * sized n by removing rows corresponding to unused parity from the bottom up
979 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
980 * using Gauss-Jordan elimination. In the example below we use m=3 parity
981 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
983 * | 1 1 1 1 1 1 1 1 |
984 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
985 * | 19 205 116 29 64 16 4 1 | / /
986 * | 1 0 0 0 0 0 0 0 | / /
987 * | 0 1 0 0 0 0 0 0 | <--' /
988 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
989 * | 0 0 0 1 0 0 0 0 |
990 * | 0 0 0 0 1 0 0 0 |
991 * | 0 0 0 0 0 1 0 0 |
992 * | 0 0 0 0 0 0 1 0 |
993 * | 0 0 0 0 0 0 0 1 |
996 * | 1 1 1 1 1 1 1 1 |
997 * | 128 64 32 16 8 4 2 1 |
998 * | 19 205 116 29 64 16 4 1 |
999 * | 1 0 0 0 0 0 0 0 |
1000 * | 0 1 0 0 0 0 0 0 |
1001 * (V|I)' = | 0 0 1 0 0 0 0 0 |
1002 * | 0 0 0 1 0 0 0 0 |
1003 * | 0 0 0 0 1 0 0 0 |
1004 * | 0 0 0 0 0 1 0 0 |
1005 * | 0 0 0 0 0 0 1 0 |
1006 * | 0 0 0 0 0 0 0 1 |
1009 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1010 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1011 * matrix is not singular.
1013 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1014 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1015 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1016 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1017 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1018 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1019 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1020 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1023 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1024 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1025 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1026 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1027 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1028 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1029 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1030 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1033 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1034 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1035 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1036 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1037 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1038 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1039 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1040 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1043 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1044 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1045 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1046 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1047 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1048 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1049 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1050 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1053 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1054 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1055 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1056 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1057 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1058 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1059 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1060 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1063 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1064 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1065 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1066 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1067 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1068 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1069 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1070 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1073 * | 0 0 1 0 0 0 0 0 |
1074 * | 167 100 5 41 159 169 217 208 |
1075 * | 166 100 4 40 158 168 216 209 |
1076 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1077 * | 0 0 0 0 1 0 0 0 |
1078 * | 0 0 0 0 0 1 0 0 |
1079 * | 0 0 0 0 0 0 1 0 |
1080 * | 0 0 0 0 0 0 0 1 |
1083 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1084 * of the missing data.
1086 * As is apparent from the example above, the only non-trivial rows in the
1087 * inverse matrix correspond to the data disks that we're trying to
1088 * reconstruct. Indeed, those are the only rows we need as the others would
1089 * only be useful for reconstructing data known or assumed to be valid. For
1090 * that reason, we only build the coefficients in the rows that correspond to
1096 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1102 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1105 * Fill in the missing rows of interest.
1107 for (i = 0; i < nmap; i++) {
1108 ASSERT3S(0, <=, map[i]);
1109 ASSERT3S(map[i], <=, 2);
1116 for (j = 0; j < n; j++) {
1120 rows[i][j] = vdev_raidz_pow2[pow];
/*
 * Invert the rows of interest via Gauss-Jordan elimination over GF(2^8):
 * start invrows as an identity block, fold in the trivial (identity)
 * rows contributed by the intact data columns, then normalize each row
 * of consequence and eliminate it from the others.  Addition in GF(2^8)
 * is XOR; multiplication is done through the log/exp tables
 * (vdev_raidz_log2 / vdev_raidz_exp2).
 */
1126 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1127 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1133 * Assert that the first nmissing entries from the array of used
1134 * columns correspond to parity columns and that subsequent entries
1135 * correspond to data columns.
1137 for (i = 0; i < nmissing; i++) {
1138 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1140 for (; i < n; i++) {
1141 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1145 * First initialize the storage where we'll compute the inverse rows.
1147 for (i = 0; i < nmissing; i++) {
1148 for (j = 0; j < n; j++) {
/* Start with the identity matrix in the inverse block. */
1149 invrows[i][j] = (i == j) ? 1 : 0;
1154 * Subtract all trivial rows from the rows of consequence.
1156 for (i = 0; i < nmissing; i++) {
1157 for (j = nmissing; j < n; j++) {
1158 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1159 jj = used[j] - rm->rm_firstdatacol;
/* In GF(2^8) subtraction is XOR, so copying records the elimination. */
1161 invrows[i][j] = rows[i][jj];
1167 * For each of the rows of interest, we must normalize it and subtract
1168 * a multiple of it from the other rows.
1170 for (i = 0; i < nmissing; i++) {
/* Everything left of the pivot must already be eliminated. */
1171 for (j = 0; j < missing[i]; j++) {
1172 ASSERT0(rows[i][j]);
/* The pivot must be non-zero or the matrix is singular. */
1174 ASSERT3U(rows[i][missing[i]], !=, 0);
1177 * Compute the inverse of the first element and multiply each
1178 * element in the row by that value.
/* 255 - log2(x) is the log of the multiplicative inverse of x. */
1180 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1182 for (j = 0; j < n; j++) {
1183 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1184 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
/* Eliminate this pivot column from every other row of interest. */
1187 for (ii = 0; ii < nmissing; ii++) {
1191 ASSERT3U(rows[ii][missing[i]], !=, 0);
1193 log = vdev_raidz_log2[rows[ii][missing[i]]];
1195 for (j = 0; j < n; j++) {
1197 vdev_raidz_exp2(rows[i][j], log);
1199 vdev_raidz_exp2(invrows[i][j], log);
1205 * Verify that the data that is left in the rows is properly part of
1206 * an identity matrix.
1208 for (i = 0; i < nmissing; i++) {
1209 for (j = 0; j < n; j++) {
1210 if (j == missing[i]) {
1211 ASSERT3U(rows[i][j], ==, 1);
1213 ASSERT0(rows[i][j]);
/*
 * Use the inverted rows to regenerate the missing data: for each used
 * (source) column, multiply its bytes by the appropriate inverse-matrix
 * coefficients and accumulate into the destination (missing) columns.
 * The inverse coefficients are pre-converted to logs (invlog) so the
 * per-byte multiply becomes a table add plus one exp lookup.
 */
1220 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1221 int *missing, uint8_t **invrows, const uint8_t *used)
1226 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1227 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1231 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
/* Scratch space for the log form of the inverse coefficients. */
1235 psize = sizeof (invlog[0][0]) * n * nmissing;
1236 p = kmem_alloc(psize, KM_SLEEP);
1238 for (pp = p, i = 0; i < nmissing; i++) {
/* Precompute logs; a zero coefficient would make log undefined. */
1243 for (i = 0; i < nmissing; i++) {
1244 for (j = 0; j < n; j++) {
1245 ASSERT3U(invrows[i][j], !=, 0);
1246 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
/* Walk every used source column and scatter into the targets. */
1250 for (i = 0; i < n; i++) {
1252 ASSERT3U(c, <, rm->rm_cols);
1254 src = rm->rm_col[c].rc_data;
1255 ccount = rm->rm_col[c].rc_size;
1256 for (j = 0; j < nmissing; j++) {
1257 cc = missing[j] + rm->rm_firstdatacol;
1258 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1259 ASSERT3U(cc, <, rm->rm_cols);
1260 ASSERT3U(cc, !=, c);
1262 dst[j] = rm->rm_col[cc].rc_data;
1263 dcount[j] = rm->rm_col[cc].rc_size;
/* Columns may differ in size; only the first source may be larger. */
1266 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1268 for (x = 0; x < ccount; x++, src++) {
1270 log = vdev_raidz_log2[*src];
1272 for (cc = 0; cc < nmissing; cc++) {
/* Skip targets shorter than the current offset. */
1273 if (x >= dcount[cc])
/* Reduce the sum of logs mod 255 before exponentiating. */
1279 if ((ll = log + invlog[cc][i]) >= 255)
1281 val = vdev_raidz_pow2[ll];
1292 kmem_free(p, psize);
/*
 * General matrix-based reconstruction for any combination of up to
 * VDEV_RAIDZ_MAXPARITY missing columns: pick the parity columns to use,
 * build the relevant matrix rows, invert them, and regenerate the data.
 * NOTE(review): the elided lines presumably accumulate 'code' (a bitmask
 * of the parity columns used) which is the return value -- confirm
 * against the full source.
 */
1296 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1300 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1301 int parity_map[VDEV_RAIDZ_MAXPARITY];
1306 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1307 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
/* n is the number of data columns. */
1313 n = rm->rm_cols - rm->rm_firstdatacol;
1316 * Figure out which data columns are missing.
1319 for (t = 0; t < ntgts; t++) {
1320 if (tgts[t] >= rm->rm_firstdatacol) {
1321 missing_rows[nmissing_rows++] =
1322 tgts[t] - rm->rm_firstdatacol;
1327 * Figure out which parity columns to use to help generate the missing
1330 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1332 ASSERT(c < rm->rm_firstdatacol);
1335 * Skip any targeted parity columns.
1337 if (c == tgts[tt]) {
1349 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
/* One allocation carved into rows, invrows, and the 'used' array. */
1351 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1352 nmissing_rows * n + sizeof (used[0]) * n;
1353 p = kmem_alloc(psize, KM_SLEEP);
1355 for (pp = p, i = 0; i < nmissing_rows; i++) {
/* used[] lists parity columns first, then the surviving data columns. */
1363 for (i = 0; i < nmissing_rows; i++) {
1364 used[i] = parity_map[i];
1367 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1368 if (tt < nmissing_rows &&
1369 c == missing_rows[tt] + rm->rm_firstdatacol) {
1380 * Initialize the interesting rows of the matrix.
1382 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1385 * Invert the matrix.
1387 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1391 * Reconstruct the missing data using the generated matrix.
1393 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1396 kmem_free(p, psize);
/*
 * Top-level reconstruction dispatcher.  Classify the target columns into
 * bad parity vs. bad data, note which parity columns are still valid,
 * and -- unless vdev_raidz_default_to_general is set -- try the
 * hand-optimized P / Q / PQ routines before falling back to the general
 * matrix reconstruction.  Returns the parity-usage code from the routine
 * actually used.
 */
1402 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1404 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1408 int nbadparity, nbaddata;
1409 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1412 * The tgts list must already be sorted.
1414 for (i = 1; i < nt; i++) {
1415 ASSERT(t[i] > t[i - 1]);
/* Start pessimistic: assume all parity bad and all data good. */
1418 nbadparity = rm->rm_firstdatacol;
1419 nbaddata = rm->rm_cols - nbadparity;
1421 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1422 if (c < rm->rm_firstdatacol)
1423 parity_valid[c] = B_FALSE;
/* Explicitly targeted columns are treated as bad. */
1425 if (i < nt && c == t[i]) {
1428 } else if (rm->rm_col[c].rc_error != 0) {
1430 } else if (c >= rm->rm_firstdatacol) {
1433 parity_valid[c] = B_TRUE;
1438 ASSERT(ntgts >= nt);
1439 ASSERT(nbaddata >= 0);
1440 ASSERT(nbaddata + nbadparity == ntgts);
/* dt points at the data-column portion of the target list. */
1442 dt = &tgts[nbadparity];
1445 * See if we can use any of our optimized reconstruction routines.
1447 if (!vdev_raidz_default_to_general) {
/* Single missing data column: P alone suffices. */
1450 if (parity_valid[VDEV_RAIDZ_P])
1451 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1453 ASSERT(rm->rm_firstdatacol > 1);
1455 if (parity_valid[VDEV_RAIDZ_Q])
1456 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1458 ASSERT(rm->rm_firstdatacol > 2);
1462 ASSERT(rm->rm_firstdatacol > 1);
/* Two missing data columns: need both P and Q. */
1464 if (parity_valid[VDEV_RAIDZ_P] &&
1465 parity_valid[VDEV_RAIDZ_Q])
1466 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1468 ASSERT(rm->rm_firstdatacol > 2);
/* Fall back to the general matrix-based routine. */
1474 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1475 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
/*
 * Open a RAID-Z vdev: validate the parity configuration, open all
 * children, and derive the aggregate asize/max_asize/ashift from them.
 * Sizes are computed over all children (parity included); the usable
 * fraction is accounted for by vdev_raidz_asize().
 */
1481 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1485 uint64_t nparity = vd->vdev_nparity;
1490 ASSERT(nparity > 0);
/* Need at least nparity + 1 children for the config to make sense. */
1492 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1493 vd->vdev_children < nparity + 1) {
1494 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1495 return (SET_ERROR(EINVAL));
1498 vdev_open_children(vd);
1500 for (c = 0; c < vd->vdev_children; c++) {
1501 cvd = vd->vdev_child[c];
1503 if (cvd->vdev_open_error != 0) {
1504 lasterror = cvd->vdev_open_error;
/* The -1/+1 dance avoids overflow while taking the per-child minimum. */
1509 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1510 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1511 *ashift = MAX(*ashift, cvd->vdev_ashift);
1514 *asize *= vd->vdev_children;
1515 *max_asize *= vd->vdev_children;
/* More failed children than parity columns: no replicas left. */
1517 if (numerrors > nparity) {
1518 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
/*
 * Close a RAID-Z vdev by closing each of its children.
 */
1526 vdev_raidz_close(vdev_t *vd)
1530 for (c = 0; c < vd->vdev_children; c++)
1531 vdev_close(vd->vdev_child[c]);
/*
 * Convert a logical (physical I/O) size into the allocated size on a
 * RAID-Z vdev: round psize up to whole sectors, add the parity sectors
 * required (nparity per group of (cols - nparity) data sectors, rounded
 * up), then pad the total to a multiple of (nparity + 1) sectors before
 * converting back to bytes.
 */
1535 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1538 uint64_t ashift = vd->vdev_top->vdev_ashift;
1539 uint64_t cols = vd->vdev_children;
1540 uint64_t nparity = vd->vdev_nparity;
/* Ceiling-divide psize into sectors. */
1542 asize = ((psize - 1) >> ashift) + 1;
/* Parity overhead: nparity sectors per ceil(data / data-columns). */
1543 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
/* Pad to a multiple of nparity + 1 sectors (skip-sector padding). */
1544 asize = roundup(asize, nparity + 1) << ashift;
/*
 * Completion callback for a child zio: record its error status in the
 * corresponding raidz column (passed via io_private).  NOTE(review):
 * elided lines presumably also update rc_tried/rc_skipped -- confirm
 * against the full source.
 */
1550 vdev_raidz_child_done(zio_t *zio)
1552 raidz_col_t *rc = zio->io_private;
1554 rc->rc_error = zio->io_error;
1560 * Start an IO operation on a RAIDZ VDev
1563 * - For write operations:
1564 * 1. Generate the parity data
1565 * 2. Create child zio write operations to each column's vdev, for both
1567 * 3. If the column skips any sectors for padding, create optional dummy
1568 * write zio children for those areas to improve aggregation continuity.
1569 * - For read operations:
1570 * 1. Create child zio read operations to each data column's vdev to read
1571 * the range of data required for zio.
1572 * 2. If this is a scrub or resilver operation, or if any of the data
1573 * vdevs have had errors, then create zio read operations to the parity
1574 * columns' VDevs as well.
/*
 * Dispatch a zio to the RAID-Z children.  Builds the column map for the
 * I/O, then issues per-column child zios: all columns for FREE and
 * WRITE (plus optional padding writes), and for READ only the data
 * columns unless errors, missing data, or a scrub/resilver force the
 * parity columns to be read as well.
 */
1577 vdev_raidz_io_start(zio_t *zio)
1579 vdev_t *vd = zio->io_vd;
1580 vdev_t *tvd = vd->vdev_top;
/* Map the logical I/O onto per-child columns (offset/size/devidx). */
1586 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1589 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1591 if (zio->io_type == ZIO_TYPE_FREE) {
/* FREE touches every column, parity included. */
1592 for (c = 0; c < rm->rm_cols; c++) {
1593 rc = &rm->rm_col[c];
1594 cvd = vd->vdev_child[rc->rc_devidx];
1595 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1596 rc->rc_offset, rc->rc_data, rc->rc_size,
1597 zio->io_type, zio->io_priority, 0,
1598 vdev_raidz_child_done, rc));
1600 return (ZIO_PIPELINE_CONTINUE);
1603 if (zio->io_type == ZIO_TYPE_WRITE) {
/* Compute the parity columns before issuing any writes. */
1604 vdev_raidz_generate_parity(rm);
1606 for (c = 0; c < rm->rm_cols; c++) {
1607 rc = &rm->rm_col[c];
1608 cvd = vd->vdev_child[rc->rc_devidx];
1609 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1610 rc->rc_offset, rc->rc_data, rc->rc_size,
1611 zio->io_type, zio->io_priority, 0,
1612 vdev_raidz_child_done, rc));
1616 * Generate optional I/Os for any skipped sectors to improve
1617 * aggregation contiguity.
1619 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1620 ASSERT(c <= rm->rm_scols);
/* Wrap around to the first column past the last skip column. */
1621 if (c == rm->rm_scols)
1623 rc = &rm->rm_col[c];
1624 cvd = vd->vdev_child[rc->rc_devidx];
/* One sector, no data: purely to help the aggregator coalesce. */
1625 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1626 rc->rc_offset + rc->rc_size, NULL,
1627 1 << tvd->vdev_ashift,
1628 zio->io_type, zio->io_priority,
1629 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1632 return (ZIO_PIPELINE_CONTINUE);
1635 ASSERT(zio->io_type == ZIO_TYPE_READ);
1638 * Iterate over the columns in reverse order so that we hit the parity
1639 * last -- any errors along the way will force us to read the parity.
1641 for (c = rm->rm_cols - 1; c >= 0; c--) {
1642 rc = &rm->rm_col[c];
1643 cvd = vd->vdev_child[rc->rc_devidx];
/* Unreadable child: mark the column missing without issuing I/O. */
1644 if (!vdev_readable(cvd)) {
1645 if (c >= rm->rm_firstdatacol)
1646 rm->rm_missingdata++;
1648 rm->rm_missingparity++;
1649 rc->rc_error = SET_ERROR(ENXIO);
1650 rc->rc_tried = 1; /* don't even try */
/* Child's DTL says this txg may be missing: treat as stale. */
1654 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1655 if (c >= rm->rm_firstdatacol)
1656 rm->rm_missingdata++;
1658 rm->rm_missingparity++;
1659 rc->rc_error = SET_ERROR(ESTALE);
/* Read data columns always; parity only if needed or scrubbing. */
1663 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1664 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1665 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1666 rc->rc_offset, rc->rc_data, rc->rc_size,
1667 zio->io_type, zio->io_priority, 0,
1668 vdev_raidz_child_done, rc));
1672 return (ZIO_PIPELINE_CONTINUE);
1677 * Report a checksum error for a child of a RAID-Z device.
/*
 * Account a checksum error against the child device backing this column
 * and post a checksum ereport, unless the I/O is speculative (in which
 * case no blame is assigned).  bad_data is the data we read, passed
 * along for the ereport.
 */
1680 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1682 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1684 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1685 zio_bad_cksum_t zbc;
1686 raidz_map_t *rm = zio->io_vsd;
/* vdev stats are protected by the per-vdev stat lock. */
1688 mutex_enter(&vd->vdev_stat_lock);
1689 vd->vdev_stat.vs_checksum_errors++;
1690 mutex_exit(&vd->vdev_stat_lock);
/* Note whether a fault-injection checksum error was involved. */
1692 zbc.zbc_has_cksum = 0;
1693 zbc.zbc_injected = rm->rm_ecksuminjected;
1695 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1696 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1702 * We keep track of whether or not there were any injected errors, so that
1703 * any ereports we generate can note it.
/*
 * Verify the zio's checksum, remembering in the raidz map whether any
 * failure was caused by injected (fault-testing) errors so later
 * ereports can say so.  Returns the zio_checksum_error() result
 * (0 on success).
 */
1706 raidz_checksum_verify(zio_t *zio)
1708 zio_bad_cksum_t zbc;
1709 raidz_map_t *rm = zio->io_vsd;
1711 int ret = zio_checksum_error(zio, &zbc);
1712 if (ret != 0 && zbc.zbc_injected != 0)
1713 rm->rm_ecksuminjected = 1;
1719 * Generate the parity from the data columns. If we tried and were able to
1720 * read the parity without error, verify that the generated parity matches the
1721 * data we read. If it doesn't, we fire off a checksum error. Return the
1722 * number of such failures.
/*
 * Check the parity we read against freshly generated parity: save the
 * read parity, regenerate it from the data columns, and flag any column
 * that differs with ECKSUM plus a checksum ereport.  Only columns that
 * were actually read without error are checked.
 */
1725 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1727 void *orig[VDEV_RAIDZ_MAXPARITY];
/* Snapshot the parity as read from disk. */
1731 for (c = 0; c < rm->rm_firstdatacol; c++) {
1732 rc = &rm->rm_col[c];
1733 if (!rc->rc_tried || rc->rc_error != 0)
1735 orig[c] = zio_buf_alloc(rc->rc_size);
1736 bcopy(rc->rc_data, orig[c], rc->rc_size);
/* Overwrites the parity columns with regenerated parity. */
1739 vdev_raidz_generate_parity(rm);
1741 for (c = 0; c < rm->rm_firstdatacol; c++) {
1742 rc = &rm->rm_col[c];
1743 if (!rc->rc_tried || rc->rc_error != 0)
/* Mismatch means the on-disk parity was silently corrupt. */
1745 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1746 raidz_checksum_error(zio, rc, orig[c]);
1747 rc->rc_error = SET_ERROR(ECKSUM);
1750 zio_buf_free(orig[c], rc->rc_size);
1757 * Keep statistics on all the ways that we used parity to correct data.
/*
 * Indexed by the parity-combination "code" returned by the
 * reconstruction routines; bumped via atomic_inc_64() on each
 * successful correction.
 */
1759 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
/*
 * Return the most severe error recorded across all columns of the map,
 * as ranked by zio_worst_error().
 */
1762 vdev_raidz_worst_error(raidz_map_t *rm)
1766 for (int c = 0; c < rm->rm_cols; c++)
1767 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1773 * Iterate over all combinations of bad data and attempt a reconstruction.
1774 * Note that the algorithm below is non-optimal because it doesn't take into
1775 * account how reconstruction is actually performed. For example, with
1776 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1777 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1778 * cases we'd only use parity information in column 0.
/*
 * Combinatorial reconstruction: for n = 1 up to the remaining parity
 * budget, try every n-combination of error-free columns as hypothetical
 * silent-corruption victims, reconstruct them, and accept the first
 * combination that yields a verifying checksum.  tgts is offset into
 * tstore by one so tgts[-1] and tgts[n] can serve as sentinels for the
 * combination-stepping logic.  Returns the reconstruction code on
 * success (0 on total failure).
 */
1781 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1783 raidz_map_t *rm = zio->io_vsd;
1785 void *orig[VDEV_RAIDZ_MAXPARITY];
1786 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
/* Leave room for a sentinel before tgts[0]. */
1787 int *tgts = &tstore[1];
1788 int current, next, i, c, n;
1791 ASSERT(total_errors < rm->rm_firstdatacol);
1794 * This simplifies one edge condition.
/* Outer loop: try progressively larger sets of assumed-bad columns. */
1798 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1800 * Initialize the targets array by finding the first n columns
1801 * that contain no error.
1803 * If there were no data errors, we need to ensure that we're
1804 * always explicitly attempting to reconstruct at least one
1805 * data column. To do this, we simply push the highest target
1806 * up into the data columns.
1808 for (c = 0, i = 0; i < n; i++) {
1809 if (i == n - 1 && data_errors == 0 &&
1810 c < rm->rm_firstdatacol) {
1811 c = rm->rm_firstdatacol;
/* Skip past columns that already reported errors. */
1814 while (rm->rm_col[c].rc_error != 0) {
1816 ASSERT3S(c, <, rm->rm_cols);
1823 * Setting tgts[n] simplifies the other edge condition.
1825 tgts[n] = rm->rm_cols;
1828 * These buffers were allocated in previous iterations.
1830 for (i = 0; i < n - 1; i++) {
1831 ASSERT(orig[i] != NULL);
/* Only the newest slot needs a fresh save buffer. */
1834 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1837 next = tgts[current];
/* Inner loop: step through every n-combination of valid columns. */
1839 while (current != n) {
1840 tgts[current] = next;
1844 * Save off the original data that we're going to
1845 * attempt to reconstruct.
1847 for (i = 0; i < n; i++) {
1848 ASSERT(orig[i] != NULL);
1851 ASSERT3S(c, <, rm->rm_cols);
1852 rc = &rm->rm_col[c];
1853 bcopy(rc->rc_data, orig[i], rc->rc_size);
1857 * Attempt a reconstruction and exit the outer loop on
1860 code = vdev_raidz_reconstruct(rm, tgts, n);
1861 if (raidz_checksum_verify(zio) == 0) {
1862 atomic_inc_64(&raidz_corrected[code]);
/* Blame the columns we just rewrote: they held silent corruption. */
1864 for (i = 0; i < n; i++) {
1866 rc = &rm->rm_col[c];
1867 ASSERT(rc->rc_error == 0);
1869 raidz_checksum_error(zio, rc,
1871 rc->rc_error = SET_ERROR(ECKSUM);
1879 * Restore the original data.
1881 for (i = 0; i < n; i++) {
1883 rc = &rm->rm_col[c];
1884 bcopy(orig[i], rc->rc_data, rc->rc_size);
1889 * Find the next valid column after the current
1892 for (next = tgts[current] + 1;
1893 next < rm->rm_cols &&
1894 rm->rm_col[next].rc_error != 0; next++)
1897 ASSERT(next <= tgts[current + 1]);
1900 * If that spot is available, we're done here.
1902 if (next != tgts[current + 1])
1906 * Otherwise, find the next valid column after
1907 * the previous position.
1909 for (c = tgts[current - 1] + 1;
1910 rm->rm_col[c].rc_error != 0; c++)
1916 } while (current != n);
/* Free the per-combination save buffers. */
1921 for (i = 0; i < n; i++) {
1922 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1929 * Complete an IO operation on a RAIDZ VDev
1932 * - For write operations:
1933 * 1. Check for errors on the child IOs.
1934 * 2. Return, setting an error code if too few child VDevs were written
1935 * to reconstruct the data later. Note that partial writes are
1936 * considered successful if they can be reconstructed at all.
1937 * - For read operations:
1938 * 1. Check for errors on the child IOs.
1939 * 2. If data errors occurred:
1940 * a. Try to reassemble the data from the parity available.
1941 * b. If we haven't yet read the parity drives, read them now.
1942 * c. If all parity drives have been read but the data still doesn't
1943 * reassemble with a correct checksum, then try combinatorial
1945 * d. If that doesn't work, return an error.
1946 * 3. If there were unexpected errors or this is a resilver operation,
1947 * rewrite the vdevs that had errors.
/*
 * Completion handler for a RAID-Z zio.  Tallies per-column errors, then
 * handles each I/O type: writes tolerate up to nparity failures; reads
 * go through up to three phases -- verify the data as read, retry with
 * every column read, then combinatorial reconstruction -- and finally
 * repair any children for which we now hold known-good data.
 */
1950 vdev_raidz_io_done(zio_t *zio)
1952 vdev_t *vd = zio->io_vd;
1954 raidz_map_t *rm = zio->io_vsd;
1956 int unexpected_errors = 0;
1957 int parity_errors = 0;
1958 int parity_untried = 0;
1959 int data_errors = 0;
1960 int total_errors = 0;
1962 int tgts[VDEV_RAIDZ_MAXPARITY];
1965 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1967 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1968 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
/* Classify each column's outcome into the error counters above. */
1970 for (c = 0; c < rm->rm_cols; c++) {
1971 rc = &rm->rm_col[c];
1974 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1976 if (c < rm->rm_firstdatacol)
/* Errors we didn't predict (vs. skipped columns) count as unexpected. */
1981 if (!rc->rc_skipped)
1982 unexpected_errors++;
1985 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1990 if (zio->io_type == ZIO_TYPE_WRITE) {
1992 * XXX -- for now, treat partial writes as a success.
1993 * (If we couldn't write enough columns to reconstruct
1994 * the data, the I/O failed. Otherwise, good enough.)
1996 * Now that we support write reallocation, it would be better
1997 * to treat partial failure as real failure unless there are
1998 * no non-degraded top-level vdevs left, and not update DTLs
1999 * if we intend to reallocate.
2002 if (total_errors > rm->rm_firstdatacol)
2003 zio->io_error = vdev_raidz_worst_error(rm);
2006 } else if (zio->io_type == ZIO_TYPE_FREE) {
2010 ASSERT(zio->io_type == ZIO_TYPE_READ);
2012 * There are three potential phases for a read:
2013 * 1. produce valid data from the columns read
2014 * 2. read all disks and try again
2015 * 3. perform combinatorial reconstruction
2017 * Each phase is progressively both more expensive and less likely to
2018 * occur. If we encounter more errors than we can repair or all phases
2019 * fail, we have no choice but to return an error.
2023 * If the number of errors we saw was correctable -- less than or equal
2024 * to the number of parity disks read -- attempt to produce data that
2025 * has a valid checksum. Naturally, this case applies in the absence of
2028 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
/* Phase 1a: no data errors -- the data as read may already verify. */
2029 if (data_errors == 0) {
2030 if (raidz_checksum_verify(zio) == 0) {
2032 * If we read parity information (unnecessarily
2033 * as it happens since no reconstruction was
2034 * needed) regenerate and verify the parity.
2035 * We also regenerate parity when resilvering
2036 * so we can write it out to the failed device
2039 if (parity_errors + parity_untried <
2040 rm->rm_firstdatacol ||
2041 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2042 n = raidz_parity_verify(zio, rm);
2043 unexpected_errors += n;
2044 ASSERT(parity_errors + n <=
2045 rm->rm_firstdatacol);
/* Phase 1b: reconstruct the columns that reported errors. */
2051 * We either attempt to read all the parity columns or
2052 * none of them. If we didn't try to read parity, we
2053 * wouldn't be here in the correctable case. There must
2054 * also have been fewer parity errors than parity
2055 * columns or, again, we wouldn't be in this code path.
2057 ASSERT(parity_untried == 0);
2058 ASSERT(parity_errors < rm->rm_firstdatacol);
2061 * Identify the data columns that reported an error.
2064 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2065 rc = &rm->rm_col[c];
2066 if (rc->rc_error != 0) {
2067 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2072 ASSERT(rm->rm_firstdatacol >= n);
2074 code = vdev_raidz_reconstruct(rm, tgts, n);
2076 if (raidz_checksum_verify(zio) == 0) {
2077 atomic_inc_64(&raidz_corrected[code]);
2080 * If we read more parity disks than were used
2081 * for reconstruction, confirm that the other
2082 * parity disks produced correct data. This
2083 * routine is suboptimal in that it regenerates
2084 * the parity that we already used in addition
2085 * to the parity that we're attempting to
2086 * verify, but this should be a relatively
2087 * uncommon case, and can be optimized if it
2088 * becomes a problem. Note that we regenerate
2089 * parity when resilvering so we can write it
2090 * out to failed devices later.
2092 if (parity_errors < rm->rm_firstdatacol - n ||
2093 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2094 n = raidz_parity_verify(zio, rm);
2095 unexpected_errors += n;
2096 ASSERT(parity_errors + n <=
2097 rm->rm_firstdatacol);
2106 * This isn't a typical situation -- either we got a read error or
2107 * a child silently returned bad data. Read every block so we can
2108 * try again with as much data and parity as we can track down. If
2109 * we've already been through once before, all children will be marked
2110 * as tried so we'll proceed to combinatorial reconstruction.
2112 unexpected_errors = 1;
2113 rm->rm_missingdata = 0;
2114 rm->rm_missingparity = 0;
/* Phase 2: issue reads for every column not yet tried, then redo. */
2116 for (c = 0; c < rm->rm_cols; c++) {
2117 if (rm->rm_col[c].rc_tried)
/* Re-enter the vdev I/O stage once the extra reads complete. */
2120 zio_vdev_io_redone(zio);
2122 rc = &rm->rm_col[c];
2125 zio_nowait(zio_vdev_child_io(zio, NULL,
2126 vd->vdev_child[rc->rc_devidx],
2127 rc->rc_offset, rc->rc_data, rc->rc_size,
2128 zio->io_type, zio->io_priority, 0,
2129 vdev_raidz_child_done, rc));
2130 } while (++c < rm->rm_cols);
2136 * At this point we've attempted to reconstruct the data given the
2137 * errors we detected, and we've attempted to read all columns. There
2138 * must, therefore, be one or more additional problems -- silent errors
2139 * resulting in invalid data rather than explicit I/O errors resulting
2140 * in absent data. We check if there is enough additional data to
2141 * possibly reconstruct the data and then perform combinatorial
2142 * reconstruction over all possible combinations. If that fails,
2145 if (total_errors > rm->rm_firstdatacol) {
2146 zio->io_error = vdev_raidz_worst_error(rm);
/* Phase 3: combinatorial reconstruction as the last resort. */
2148 } else if (total_errors < rm->rm_firstdatacol &&
2149 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2151 * If we didn't use all the available parity for the
2152 * combinatorial reconstruction, verify that the remaining
2153 * parity is correct.
2155 if (code != (1 << rm->rm_firstdatacol) - 1)
2156 (void) raidz_parity_verify(zio, rm);
2159 * We're here because either:
2161 * total_errors == rm_firstdatacol, or
2162 * vdev_raidz_combrec() failed
2164 * In either case, there is enough bad data to prevent
2167 * Start checksum ereports for all children which haven't
2168 * failed, and the IO wasn't speculative.
2170 zio->io_error = SET_ERROR(ECKSUM);
2172 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2173 for (c = 0; c < rm->rm_cols; c++) {
2174 rc = &rm->rm_col[c];
2175 if (rc->rc_error == 0) {
2176 zio_bad_cksum_t zbc;
2177 zbc.zbc_has_cksum = 0;
2179 rm->rm_ecksuminjected;
2181 zfs_ereport_start_checksum(
2183 vd->vdev_child[rc->rc_devidx],
2184 zio, rc->rc_offset, rc->rc_size,
2185 (void *)(uintptr_t)c, &zbc);
2192 zio_checksum_verified(zio);
/* Repair phase: write good data back to children that had errors. */
2194 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2195 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2197 * Use the good data we have in hand to repair damaged children.
2199 for (c = 0; c < rm->rm_cols; c++) {
2200 rc = &rm->rm_col[c];
2201 cvd = vd->vdev_child[rc->rc_devidx];
2203 if (rc->rc_error == 0)
2206 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2207 rc->rc_offset, rc->rc_data, rc->rc_size,
2208 ZIO_TYPE_WRITE, zio->io_priority,
2209 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2210 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
/*
 * Propagate child state changes up to the RAID-Z vdev: more faulted
 * children than parity means the vdev can't open; any faulted or
 * degraded child degrades it; otherwise it is healthy.
 */
2216 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2218 if (faulted > vd->vdev_nparity)
2219 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2220 VDEV_AUX_NO_REPLICAS);
2221 else if (degraded + faulted != 0)
2222 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2224 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
/*
 * Operations vector registering the RAID-Z vdev type.  NOTE(review):
 * several initializer entries (open/close/asize/io_done, etc.) are
 * elided from this listing.
 */
2227 vdev_ops_t vdev_raidz_ops = {
2231 vdev_raidz_io_start,
2233 vdev_raidz_state_change,
2236 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2237 B_FALSE /* not a leaf vdev */