4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
29 #include <sys/zfs_context.h>
31 #include <sys/vdev_impl.h>
33 #include <sys/vdev_disk.h>
35 #include <sys/vdev_file.h>
36 #include <sys/vdev_raidz.h>
38 #include <sys/zio_checksum.h>
39 #include <sys/fs/zfs.h>
40 #include <sys/fm/fs/zfs.h>
44 * Virtual device vector for RAID-Z.
46 * This vdev supports single, double, and triple parity. For single parity,
47 * we use a simple XOR of all the data columns. For double or triple parity,
48 * we use a special case of Reed-Solomon coding. This extends the
49 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
50 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
51 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
52 * former is also based. The latter is designed to provide higher performance
55 * Note that the Plank paper claimed to support arbitrary N+M, but was then
56 * amended six years later identifying a critical flaw that invalidates its
57 * claims. Nevertheless, the technique can be adapted to work for up to
58 * triple parity. For additional parity, the amendment "Note: Correction to
59 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
60 * is viable, but the additional complexity means that write performance will
63 * All of the methods above operate on a Galois field, defined over the
64 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
65 * can be expressed with a single byte. Briefly, the operations on the
66 * field are defined as follows:
68 * o addition (+) is represented by a bitwise XOR
69 * o subtraction (-) is therefore identical to addition: A + B = A - B
70 * o multiplication of A by 2 is defined by the following bitwise expression:
75 * (A * 2)_4 = A_3 + A_7
76 * (A * 2)_3 = A_2 + A_7
77 * (A * 2)_2 = A_1 + A_7
81 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
82 * As an aside, this multiplication is derived from the error correcting
83 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
85 * Observe that any number in the field (except for 0) can be expressed as a
86 * power of 2 -- a generator for the field. We store a table of the powers of
87 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
88 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
89 * than field addition). The inverse of a field element A (A^-1) is therefore
90 * A ^ (255 - 1) = A^254.
92 * The up-to-three parity columns, P, Q, R over several data columns,
93 * D_0, ... D_n-1, can be expressed by field operations:
95 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
96 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
97 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
98 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
99 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
101 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
102 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
103 * independent coefficients. (There are no additional coefficients that have
104 * this property which is why the uncorrected Plank method breaks down.)
106 * See the reconstruction code below for how P, Q and R can be used individually
107 * or in concert to recover missing data columns.
/*
 * Per-column state for one mapped RAID-Z I/O: one entry per child vdev
 * touched (or skipped) by the I/O, held in raidz_map_t's rm_col[] array.
 */
110 typedef struct raidz_col {
111 uint64_t rc_devidx; /* child device index for I/O */
112 uint64_t rc_offset; /* device offset */
113 uint64_t rc_size; /* I/O size */
114 void *rc_data; /* I/O data */
115 void *rc_gdata; /* used to store the "good" version */
116 int rc_error; /* I/O error for this device */
117 uint8_t rc_tried; /* Did we attempt this I/O column? */
118 uint8_t rc_skipped; /* Did we skip this I/O column? */
/*
 * Describes how one logical I/O is spread across the child vdevs:
 * column geometry, padding/skip accounting, checksum-report reference
 * counting, and a trailing old-style flexible array of raidz_col_t
 * (allocated via offsetof(raidz_map_t, rm_col[rm_scols])).
 */
121 typedef struct raidz_map {
122 uint64_t rm_cols; /* Regular column count */
123 uint64_t rm_scols; /* Count including skipped columns */
124 uint64_t rm_bigcols; /* Number of oversized columns */
125 uint64_t rm_asize; /* Actual total I/O size */
126 uint64_t rm_missingdata; /* Count of missing data devices */
127 uint64_t rm_missingparity; /* Count of missing parity devices */
128 uint64_t rm_firstdatacol; /* First data column/parity count */
129 uint64_t rm_nskip; /* Skipped sectors for padding */
130 uint64_t rm_skipstart; /* Column index of padding start */
131 void *rm_datacopy; /* rm_asize-buffer of copied data */
132 uintptr_t rm_reports; /* # of referencing checksum reports */
133 uint8_t rm_freed; /* map no longer has referencing ZIO */
134 uint8_t rm_ecksuminjected; /* checksum error was injected */
135 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
/* Indices of the parity columns within rm_col[]. */
138 #define VDEV_RAIDZ_P 0
139 #define VDEV_RAIDZ_Q 1
140 #define VDEV_RAIDZ_R 2
/*
 * Multiply a single byte by 2 in GF(2^8): shift left and, if the high
 * bit was set, reduce by XORing 0x1d (from the primitive polynomial
 * x^8 + x^4 + x^3 + x^2 + 1 described above).  NOTE: the argument is
 * evaluated more than once — do not pass expressions with side effects.
 */
142 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
/* Multiply by 4 = two successive multiplications by 2. */
143 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
146 * We provide a mechanism to perform the field multiplication operation on a
147 * 64-bit value all at once rather than a byte at a time. This works by
148 * creating a mask from the top bit in each byte and using that to
149 * conditionally apply the XOR of 0x1d.
/*
 * 64-bit-wide GF(2^8) multiply-by-2 of eight packed bytes at once (see
 * the comment above): "mask" gets 0xff in each byte whose high bit was
 * set ((m << 1) - (m >> 7) turns 0x80 into 0xff per byte), the shift is
 * masked to stop carries crossing byte lanes, and 0x1d is XORed into
 * exactly the lanes that overflowed.  Both x and mask are written.
 * The _4 variant applies the _2 step twice.
 */
151 #define VDEV_RAIDZ_64MUL_2(x, mask) \
153 (mask) = (x) & 0x8080808080808080ULL; \
154 (mask) = ((mask) << 1) - ((mask) >> 7); \
155 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
156 ((mask) & 0x1d1d1d1d1d1d1d1d); \
159 #define VDEV_RAIDZ_64MUL_4(x, mask) \
161 VDEV_RAIDZ_64MUL_2((x), mask); \
162 VDEV_RAIDZ_64MUL_2((x), mask); \
/*
 * Translate an offset to its on-disk location past the front label area.
 * The argument is parenthesized so an expression argument such as
 * VDEV_LABEL_OFFSET(a + b) expands with the intended precedence.
 */
165 #define VDEV_LABEL_OFFSET(x) ((x) + VDEV_LABEL_START_SIZE)
168 * Force reconstruction to use the general purpose method.
/* Global tunable: nonzero selects the general (matrix) reconstruction path. */
170 int vdev_raidz_default_to_general;
172 /* Powers of 2 in the Galois field defined above. */
/*
 * vdev_raidz_pow2[i] == 2^i in GF(2^8) under the primitive polynomial
 * noted above; the sequence is cyclic with period 255, so the final
 * entry (index 255) wraps back to 0x01.
 */
173 static const uint8_t vdev_raidz_pow2[256] = {
174 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
175 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
176 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
177 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
178 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
179 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
180 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
181 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
182 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
183 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
184 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
185 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
186 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
187 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
188 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
189 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
190 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
191 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
192 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
193 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
194 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
195 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
196 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
197 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
198 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
199 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
200 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
201 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
202 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
203 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
204 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
205 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
207 /* Logs of 2 in the Galois field defined above. */
/*
 * vdev_raidz_log2[i] == log base 2 of i in GF(2^8); the inverse of the
 * pow2 table above.  Index 0 holds 0 as a placeholder (zero has no
 * logarithm), which is why callers assert log2[a] > 0 || a == 1.
 */
208 static const uint8_t vdev_raidz_log2[256] = {
209 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
210 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
211 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
212 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
213 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
214 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
215 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
216 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
217 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
218 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
219 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
220 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
221 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
222 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
223 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
224 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
225 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
226 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
227 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
228 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
229 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
230 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
231 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
232 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
233 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
234 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
235 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
236 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
237 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
238 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
239 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
240 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
/* Forward declaration: needed by vdev_raidz_cksum_finish() below. */
243 static void vdev_raidz_generate_parity(raidz_map_t *rm);
246 * Multiply a given number by 2 raised to the given power.
/*
 * Computed in log space: result = pow2[log2[a] + exp].  The ASSERT
 * documents that log2[a] is only meaningful for nonzero a (log2[1] is
 * the table's only legitimate zero entry).  NOTE(review): the exponent
 * must be reduced mod 255 before the table lookup, and a == 0 must
 * short-circuit to 0 — confirm both happen in the lines not shown here.
 */
249 vdev_raidz_exp2(uint_t a, int exp)
255 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
257 exp += vdev_raidz_log2[a];
261 return (vdev_raidz_pow2[exp]);
/*
 * Free a raidz_map_t and the buffers it owns: the parity columns'
 * rc_data and cached rc_gdata buffers (columns below rm_firstdatacol)
 * and the rm_datacopy buffer, sized as the sum of the data-column
 * sizes.  Data-column rc_data pointers are not freed — they point into
 * caller memory (see vdev_raidz_map_alloc()).
 */
265 vdev_raidz_map_free(raidz_map_t *rm)
270 for (c = 0; c < rm->rm_firstdatacol; c++) {
271 if (rm->rm_col[c].rc_data != NULL)
272 zio_buf_free(rm->rm_col[c].rc_data,
273 rm->rm_col[c].rc_size);
275 if (rm->rm_col[c].rc_gdata != NULL)
276 zio_buf_free(rm->rm_col[c].rc_gdata,
277 rm->rm_col[c].rc_size);
/* rm_datacopy, when present, covers all data columns back to back. */
281 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
282 size += rm->rm_col[c].rc_size;
284 if (rm->rm_datacopy != NULL)
285 zio_buf_free(rm->rm_datacopy, size);
/* The map itself was allocated with rm_scols trailing columns. */
287 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
/*
 * zio vsd destructor.  Only frees the map immediately when no checksum
 * reports still reference it; otherwise the last report's free callback
 * (vdev_raidz_cksum_free) performs the deferred free.  NOTE(review):
 * rm_freed is presumably set here in lines not visible — confirm.
 */
291 vdev_raidz_map_free_vsd(zio_t *zio)
293 raidz_map_t *rm = zio->io_vsd;
295 ASSERT0(rm->rm_freed);
298 if (rm->rm_reports == 0)
299 vdev_raidz_map_free(rm);
/*
 * Checksum-report free callback: drop one reference on the raidz_map_t
 * and free it if this was the last reference and the owning zio has
 * already finished (rm_freed set).
 */
304 vdev_raidz_cksum_free(void *arg, size_t ignored)
306 raidz_map_t *rm = arg;
308 ASSERT3U(rm->rm_reports, >, 0);
310 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
311 vdev_raidz_map_free(rm);
/*
 * Finish a deferred checksum ereport for column c: compute the expected
 * ("good") contents of that column from good_data and hand the
 * good/bad pair to zfs_ereport_finish_checksum().  For parity columns
 * the good parity is regenerated from good_data once and cached in
 * rc_gdata; for data columns the right slice of good_data is used.
 */
315 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
317 raidz_map_t *rm = zcr->zcr_cbdata;
318 size_t c = zcr->zcr_cbinfo;
321 const char *good = NULL;
322 const char *bad = rm->rm_col[c].rc_data;
/* No good data available: file the ereport without a comparison. */
324 if (good_data == NULL) {
325 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
329 if (c < rm->rm_firstdatacol) {
331 * The first time through, calculate the parity blocks for
332 * the good data (this relies on the fact that the good
333 * data never changes for a given logical ZIO)
335 if (rm->rm_col[0].rc_gdata == NULL) {
336 char *bad_parity[VDEV_RAIDZ_MAXPARITY];
340 * Set up the rm_col[]s to generate the parity for
341 * good_data, first saving the parity bufs and
342 * replacing them with buffers to hold the result.
344 for (x = 0; x < rm->rm_firstdatacol; x++) {
345 bad_parity[x] = rm->rm_col[x].rc_data;
346 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
347 zio_buf_alloc(rm->rm_col[x].rc_size);
350 /* fill in the data columns from good_data */
351 buf = (char *)good_data;
352 for (; x < rm->rm_cols; x++) {
353 rm->rm_col[x].rc_data = buf;
354 buf += rm->rm_col[x].rc_size;
358 * Construct the parity from the good data.
360 vdev_raidz_generate_parity(rm);
362 /* restore everything back to its original state */
363 for (x = 0; x < rm->rm_firstdatacol; x++)
364 rm->rm_col[x].rc_data = bad_parity[x];
/* Data columns point back into the saved rm_datacopy snapshot. */
366 buf = rm->rm_datacopy;
367 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
368 rm->rm_col[x].rc_data = buf;
369 buf += rm->rm_col[x].rc_size;
373 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
374 good = rm->rm_col[c].rc_gdata;
376 /* adjust good_data to point at the start of our column */
379 for (x = rm->rm_firstdatacol; x < c; x++)
380 good += rm->rm_col[x].rc_size;
383 /* we drop the ereport if it ends up that the data was good */
384 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
388 * Invoked indirectly by zfs_ereport_start_checksum(), called
389 * below when our read operation fails completely. The main point
390 * is to keep a copy of everything we read from disk, so that at
391 * vdev_raidz_cksum_finish() time we can compare it with the good data.
394 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
396 size_t c = (size_t)(uintptr_t)arg;
399 raidz_map_t *rm = zio->io_vsd;
402 /* set up the report and bump the refcount */
403 zcr->zcr_cbdata = rm;
405 zcr->zcr_finish = vdev_raidz_cksum_finish;
406 zcr->zcr_free = vdev_raidz_cksum_free;
409 ASSERT3U(rm->rm_reports, >, 0);
/* Data already copied aside by an earlier report for this map. */
411 if (rm->rm_datacopy != NULL)
415 * It's the first time we're called for this raidz_map_t, so we need
416 * to copy the data aside; there's no guarantee that our zio's buffer
417 * won't be re-used for something else.
419 * Our parity data is already in separate buffers, so there's no need
/* Size the snapshot as the sum of all data-column sizes. */
424 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
425 size += rm->rm_col[c].rc_size;
427 buf = rm->rm_datacopy = zio_buf_alloc(size);
429 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
430 raidz_col_t *col = &rm->rm_col[c];
432 bcopy(col->rc_data, buf, col->rc_size);
437 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
/* vsd (vdev-specific data) callbacks installed on raidz zios. */
440 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
441 vdev_raidz_map_free_vsd,
442 vdev_raidz_cksum_report
446 * Divides the IO evenly across all child vdevs; usually, dcols is
447 * the number of children in the target vdev.
/*
 * Build and return the raidz_map_t describing how the I/O at
 * (data, size, offset) lays out over dcols children with nparity
 * parity columns; unit_shift is the child's minimum-sector shift.
 * Parity columns get freshly allocated buffers; data columns point
 * directly into the caller's "data" buffer.  NOTE(review): the dofree
 * flag is not referenced in the visible portion of this function.
 */
450 vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
451 uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
454 /* The starting RAIDZ (parent) vdev sector of the block. */
455 uint64_t b = offset >> unit_shift;
456 /* The zio's size in units of the vdev's minimum sector size. */
457 uint64_t s = size >> unit_shift;
458 /* The first column for this stripe. */
459 uint64_t f = b % dcols;
460 /* The starting byte offset on each child vdev. */
461 uint64_t o = (b / dcols) << unit_shift;
462 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
465 * "Quotient": The number of data sectors for this stripe on all but
466 * the "big column" child vdevs that also contain "remainder" data.
468 q = s / (dcols - nparity);
471 * "Remainder": The number of partial stripe data sectors in this I/O.
472 * This will add a sector to some, but not all, child vdevs.
474 r = s - q * (dcols - nparity);
476 /* The number of "big columns" - those which contain remainder data. */
477 bc = (r == 0 ? 0 : r + nparity);
480 * The total number of data and parity sectors associated with
483 tot = s + nparity * (q + (r == 0 ? 0 : 1));
485 /* acols: The columns that will be accessed. */
486 /* scols: The columns that will be accessed or skipped. */
488 /* Our I/O request doesn't span all child vdevs. */
490 scols = MIN(dcols, roundup(bc, nparity + 1));
496 ASSERT3U(acols, <=, scols);
498 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
501 rm->rm_scols = scols;
503 rm->rm_skipstart = bc;
504 rm->rm_missingdata = 0;
505 rm->rm_missingparity = 0;
506 rm->rm_firstdatacol = nparity;
507 rm->rm_datacopy = NULL;
510 rm->rm_ecksuminjected = 0;
/* Initialize each column's device, offset, and bookkeeping fields. */
514 for (c = 0; c < scols; c++) {
519 coff += 1ULL << unit_shift;
521 rm->rm_col[c].rc_devidx = col;
522 rm->rm_col[c].rc_offset = coff;
523 rm->rm_col[c].rc_data = NULL;
524 rm->rm_col[c].rc_gdata = NULL;
525 rm->rm_col[c].rc_error = 0;
526 rm->rm_col[c].rc_tried = 0;
527 rm->rm_col[c].rc_skipped = 0;
/* Big columns carry q+1 sectors, the rest q; skipped columns 0. */
530 rm->rm_col[c].rc_size = 0;
532 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
534 rm->rm_col[c].rc_size = q << unit_shift;
536 asize += rm->rm_col[c].rc_size;
539 ASSERT3U(asize, ==, tot << unit_shift);
540 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
541 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
542 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
543 ASSERT3U(rm->rm_nskip, <=, nparity);
/* Allocate parity buffers; data columns alias the caller's buffer. */
546 for (c = 0; c < rm->rm_firstdatacol; c++) {
547 rm->rm_col[c].rc_data =
548 zio_buf_alloc(rm->rm_col[c].rc_size);
551 rm->rm_col[c].rc_data = data;
553 for (c = c + 1; c < acols; c++) {
554 rm->rm_col[c].rc_data =
555 (char *)rm->rm_col[c - 1].rc_data +
556 rm->rm_col[c - 1].rc_size;
561 * If all data stored spans all columns, there's a danger that parity
562 * will always be on the same device and, since parity isn't read
563 * during normal operation, that that device's I/O bandwidth won't be
564 * used effectively. We therefore switch the parity every 1MB.
566 * ... at least that was, ostensibly, the theory. As a practical
567 * matter unless we juggle the parity between all devices evenly, we
568 * won't see any benefit. Further, occasional writes that aren't a
569 * multiple of the LCM of the number of children and the minimum
570 * stripe width are sufficient to avoid pessimal behavior.
571 * Unfortunately, this decision created an implicit on-disk format
572 * requirement that we need to support for all eternity, but only
573 * for single-parity RAID-Z.
575 * If we intend to skip a sector in the zeroth column for padding
576 * we must make sure to note this swap. We will never intend to
577 * skip the first column since at least one data and one parity
578 * column must appear in each row.
580 ASSERT(rm->rm_cols >= 2);
581 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
583 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
584 devidx = rm->rm_col[0].rc_devidx;
585 o = rm->rm_col[0].rc_offset;
586 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
587 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
588 rm->rm_col[1].rc_devidx = devidx;
589 rm->rm_col[1].rc_offset = o;
591 if (rm->rm_skipstart == 0)
592 rm->rm_skipstart = 1;
/*
 * Generate single (P) parity: the bytewise XOR of all data columns,
 * accumulated 64 bits at a time.  The first data column seeds P; later
 * (possibly shorter) columns are XORed in, with short columns treated
 * as zero-padded.
 */
599 vdev_raidz_generate_parity_p(raidz_map_t *rm)
601 uint64_t *p, *src, pcount, ccount, i;
604 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
606 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
607 src = rm->rm_col[c].rc_data;
608 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
609 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
611 if (c == rm->rm_firstdatacol) {
612 ASSERT(ccount == pcount);
613 for (i = 0; i < ccount; i++, src++, p++) {
617 ASSERT(ccount <= pcount);
618 for (i = 0; i < ccount; i++, src++, p++) {
/*
 * Generate P and Q parity.  P is the plain XOR of the data columns; Q
 * is the Horner's-rule evaluation from the header comment: for each
 * subsequent column the running Q is multiplied by 2 in GF(2^8)
 * (64 bits at a time via VDEV_RAIDZ_64MUL_2) before the new data is
 * XORed in.  Short columns are treated as zero-filled, which still
 * requires the multiply step for Q.
 */
626 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
628 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
631 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
632 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
633 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
635 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
636 src = rm->rm_col[c].rc_data;
637 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
638 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
640 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
642 if (c == rm->rm_firstdatacol) {
643 ASSERT(ccnt == pcnt || ccnt == 0);
644 for (i = 0; i < ccnt; i++, src++, p++, q++) {
648 for (; i < pcnt; i++, src++, p++, q++) {
653 ASSERT(ccnt <= pcnt);
656 * Apply the algorithm described above by multiplying
657 * the previous result and adding in the new value.
659 for (i = 0; i < ccnt; i++, src++, p++, q++) {
662 VDEV_RAIDZ_64MUL_2(*q, mask);
667 * Treat short columns as though they are full of 0s.
668 * Note that there's therefore nothing needed for P.
670 for (; i < pcnt; i++, q++) {
671 VDEV_RAIDZ_64MUL_2(*q, mask);
/*
 * Generate P, Q, and R parity.  Identical in structure to the PQ case
 * above, with a third accumulator R whose generator is 4: each step
 * multiplies the running R by 4 in GF(2^8) (VDEV_RAIDZ_64MUL_4) before
 * XORing in the new column's data.
 */
678 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
680 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
683 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
684 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
685 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
686 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
687 rm->rm_col[VDEV_RAIDZ_R].rc_size);
689 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
690 src = rm->rm_col[c].rc_data;
691 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
692 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
693 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
695 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
697 if (c == rm->rm_firstdatacol) {
698 ASSERT(ccnt == pcnt || ccnt == 0);
699 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
704 for (; i < pcnt; i++, src++, p++, q++, r++) {
710 ASSERT(ccnt <= pcnt);
713 * Apply the algorithm described above by multiplying
714 * the previous result and adding in the new value.
716 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
719 VDEV_RAIDZ_64MUL_2(*q, mask);
722 VDEV_RAIDZ_64MUL_4(*r, mask);
727 * Treat short columns as though they are full of 0s.
728 * Note that there's therefore nothing needed for P.
730 for (; i < pcnt; i++, q++, r++) {
731 VDEV_RAIDZ_64MUL_2(*q, mask);
732 VDEV_RAIDZ_64MUL_4(*r, mask);
739 * Generate RAID parity in the first virtual columns according to the number of
740 * parity columns available.
/*
 * Dispatch on rm_firstdatacol (== parity count): 1 -> P, 2 -> P+Q,
 * 3 -> P+Q+R; anything else panics (the case labels are not visible
 * in this view but the call order below fixes the mapping).
 */
743 vdev_raidz_generate_parity(raidz_map_t *rm)
745 switch (rm->rm_firstdatacol) {
747 vdev_raidz_generate_parity_p(rm);
750 vdev_raidz_generate_parity_pq(rm);
753 vdev_raidz_generate_parity_pqr(rm);
756 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
/*
 * Reconstruct a single missing data column x using only P parity:
 * copy P into the target, then XOR every surviving data column back
 * out of it.  Returns the bitmask of parity columns used (P only).
 */
761 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
763 uint64_t *dst, *src, xcount, ccount, count, i;
768 ASSERT(x >= rm->rm_firstdatacol);
769 ASSERT(x < rm->rm_cols);
771 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
772 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
/* Seed the target column with P. */
775 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
776 dst = rm->rm_col[x].rc_data;
777 for (i = 0; i < xcount; i++, dst++, src++) {
/* XOR out every other data column (the loop body skips column x). */
781 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
782 src = rm->rm_col[c].rc_data;
783 dst = rm->rm_col[x].rc_data;
788 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
789 count = MIN(ccount, xcount);
791 for (i = 0; i < count; i++, dst++, src++) {
796 return (1 << VDEV_RAIDZ_P);
/*
 * Reconstruct a single missing data column x using only Q parity:
 * re-run the Q accumulation over the surviving data columns (treating
 * column x as zero), XOR the result with Q, then divide out x's
 * coefficient by multiplying each byte by 2^(255 - (ndevs - 1 - x)).
 * Returns the bitmask of parity columns used (Q only).
 */
800 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
802 uint64_t *dst, *src, xcount, ccount, count, mask, i;
809 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
810 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
812 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
813 src = rm->rm_col[c].rc_data;
814 dst = rm->rm_col[x].rc_data;
819 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
821 count = MIN(ccount, xcount);
823 if (c == rm->rm_firstdatacol) {
824 for (i = 0; i < count; i++, dst++, src++) {
827 for (; i < xcount; i++, dst++) {
832 for (i = 0; i < count; i++, dst++, src++) {
833 VDEV_RAIDZ_64MUL_2(*dst, mask);
837 for (; i < xcount; i++, dst++) {
838 VDEV_RAIDZ_64MUL_2(*dst, mask);
/* Fold in Q and scale each byte back by the inverse coefficient. */
843 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
844 dst = rm->rm_col[x].rc_data;
845 exp = 255 - (rm->rm_cols - 1 - x);
847 for (i = 0; i < xcount; i++, dst++, src++) {
849 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
850 *b = vdev_raidz_exp2(*b, exp);
854 return (1 << VDEV_RAIDZ_Q);
/*
 * Reconstruct two missing data columns x and y (x < y) using both P
 * and Q, per the closed-form solution in the comment below: compute
 * Pxy/Qxy (parity as if x and y were zero), then solve for D_x and
 * D_y.  Returns the bitmask of parity columns used (P and Q).
 */
858 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
860 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
862 uint64_t xsize, ysize, i;
868 ASSERT(x >= rm->rm_firstdatacol);
869 ASSERT(y < rm->rm_cols);
871 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
874 * Move the parity data aside -- we're going to compute parity as
875 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
876 * reuse the parity generation mechanism without trashing the actual
877 * parity so we make those columns appear to be full of zeros by
878 * setting their lengths to zero.
880 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
881 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
882 xsize = rm->rm_col[x].rc_size;
883 ysize = rm->rm_col[y].rc_size;
885 rm->rm_col[VDEV_RAIDZ_P].rc_data =
886 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
887 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
888 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
889 rm->rm_col[x].rc_size = 0;
890 rm->rm_col[y].rc_size = 0;
892 vdev_raidz_generate_parity_pq(rm);
894 rm->rm_col[x].rc_size = xsize;
895 rm->rm_col[y].rc_size = ysize;
899 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
900 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
901 xd = rm->rm_col[x].rc_data;
902 yd = rm->rm_col[y].rc_data;
906 * Pxy = P + D_x + D_y
907 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
909 * We can then solve for D_x:
910 * D_x = A * (P + Pxy) + B * (Q + Qxy)
912 * A = 2^(x - y) * (2^(x - y) + 1)^-1
913 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
915 * With D_x in hand, we can easily solve for D_y:
916 * D_y = P + Pxy + D_x
/* Coefficients computed in log space using the pow2/log2 tables. */
919 a = vdev_raidz_pow2[255 + x - y];
920 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
921 tmp = 255 - vdev_raidz_log2[a ^ 1];
923 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
924 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
926 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
927 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
928 vdev_raidz_exp2(*q ^ *qxy, bexp);
931 *yd = *p ^ *pxy ^ *xd;
/* Release the scratch Pxy/Qxy buffers allocated above. */
934 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
935 rm->rm_col[VDEV_RAIDZ_P].rc_size);
936 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
937 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
940 * Restore the saved parity data.
942 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
943 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
945 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
950 * In the general case of reconstruction, we must solve the system of linear
951 * equations defined by the coefficients used to generate parity as well as
952 * the contents of the data and parity disks. This can be expressed with
953 * vectors for the original data (D) and the actual data (d) and parity (p)
954 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
958 * | V | | D_0 | | p_m-1 |
959 * | | x | : | = | d_0 |
960 * | I | | D_n-1 | | : |
961 * | | ~~ ~~ | d_n-1 |
964 * I is simply a square identity matrix of size n, and V is a Vandermonde
965 * matrix defined by the coefficients we chose for the various parity columns
966 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
967 * computation as well as linear separability.
970 * | 1 .. 1 1 1 | | p_0 |
971 * | 2^n-1 .. 4 2 1 | __ __ | : |
972 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
973 * | 1 .. 0 0 0 | | D_1 | | d_0 |
974 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
975 * | : : : : | | : | | d_2 |
976 * | 0 .. 1 0 0 | | D_n-1 | | : |
977 * | 0 .. 0 1 0 | ~~ ~~ | : |
978 * | 0 .. 0 0 1 | | d_n-1 |
981 * Note that I, V, d, and p are known. To compute D, we must invert the
982 * matrix and use the known data and parity values to reconstruct the unknown
983 * data values. We begin by removing the rows in V|I and d|p that correspond
984 * to failed or missing columns; we then make V|I square (n x n) and d|p
985 * sized n by removing rows corresponding to unused parity from the bottom up
986 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
987 * using Gauss-Jordan elimination. In the example below we use m=3 parity
988 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
990 * | 1 1 1 1 1 1 1 1 |
991 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
992 * | 19 205 116 29 64 16 4 1 | / /
993 * | 1 0 0 0 0 0 0 0 | / /
994 * | 0 1 0 0 0 0 0 0 | <--' /
995 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
996 * | 0 0 0 1 0 0 0 0 |
997 * | 0 0 0 0 1 0 0 0 |
998 * | 0 0 0 0 0 1 0 0 |
999 * | 0 0 0 0 0 0 1 0 |
1000 * | 0 0 0 0 0 0 0 1 |
1003 * | 1 1 1 1 1 1 1 1 |
1004 * | 19 205 116 29 64 16 4 1 |
1005 * | 1 0 0 0 0 0 0 0 |
1006 * (V|I)' = | 0 0 0 1 0 0 0 0 |
1007 * | 0 0 0 0 1 0 0 0 |
1008 * | 0 0 0 0 0 1 0 0 |
1009 * | 0 0 0 0 0 0 1 0 |
1010 * | 0 0 0 0 0 0 0 1 |
1013 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1014 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1015 * matrix is not singular.
1017 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1018 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1019 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1020 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1021 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1022 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1023 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1024 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1027 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1028 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1029 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1030 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1031 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1032 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1033 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1034 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1037 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1038 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1039 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1040 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1041 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1042 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1043 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1044 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1047 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1048 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1049 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1050 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1051 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1052 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1053 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1054 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1057 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1058 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1059 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1060 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1061 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1062 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1063 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1064 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1067 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1068 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1069 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1070 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1071 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1072 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1073 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1074 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1077 * | 0 0 1 0 0 0 0 0 |
1078 * | 167 100 5 41 159 169 217 208 |
1079 * | 166 100 4 40 158 168 216 209 |
1080 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1081 * | 0 0 0 0 1 0 0 0 |
1082 * | 0 0 0 0 0 1 0 0 |
1083 * | 0 0 0 0 0 0 1 0 |
1084 * | 0 0 0 0 0 0 0 1 |
1087 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1088 * of the missing data.
1090 * As is apparent from the example above, the only non-trivial rows in the
1091 * inverse matrix correspond to the data disks that we're trying to
1092 * reconstruct. Indeed, those are the only rows we need as the others would
1093 * only be useful for reconstructing data known or assumed to be valid. For
1094 * that reason, we only build the coefficients in the rows that correspond to
/*
 * Fill in the rows of the (V|I) matrix (see the comment above) that
 * correspond to the missing columns being reconstructed; each entry is
 * a power of 2 from the Vandermonde coefficients.  map[i] selects which
 * parity row (0=P, 1=Q, 2=R) the i'th row is built from.
 */
1100 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1106 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1109 * Fill in the missing rows of interest.
1111 for (i = 0; i < nmap; i++) {
1112 ASSERT3S(0, <=, map[i]);
1113 ASSERT3S(map[i], <=, 2);
1120 for (j = 0; j < n; j++) {
1124 rows[i][j] = vdev_raidz_pow2[pow];
/*
 * Invert the nmissing rows of interest via Gauss-Jordan elimination over
 * GF(2^8).  rows[] holds the matrix built by vdev_raidz_matrix_init();
 * invrows[] receives the corresponding rows of the inverse.  used[] lists
 * the columns actually read: the first nmissing entries must be parity
 * columns and the remainder data columns (asserted below).  missing[]
 * gives, for each row, the data-column index being reconstructed, which
 * serves as the pivot position.
 * NOTE(review): this excerpt is elided; some loop bodies and local
 * declarations are not visible here.
 */
1130 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1131 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1137 * Assert that the first nmissing entries from the array of used
1138 * columns correspond to parity columns and that subsequent entries
1139 * correspond to data columns.
1141 for (i = 0; i < nmissing; i++) {
1142 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1144 for (; i < n; i++) {
1145 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1149 * First initialize the storage where we'll compute the inverse rows.
/* Start invrows as the identity so it accumulates the inverse. */
1151 for (i = 0; i < nmissing; i++) {
1152 for (j = 0; j < n; j++) {
1153 invrows[i][j] = (i == j) ? 1 : 0;
1158 * Subtract all trivial rows from the rows of consequence.
1160 for (i = 0; i < nmissing; i++) {
1161 for (j = nmissing; j < n; j++) {
1162 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1163 jj = used[j] - rm->rm_firstdatacol;
1165 invrows[i][j] = rows[i][jj];
1171 * For each of the rows of interest, we must normalize it and subtract
1172 * a multiple of it from the other rows.
1174 for (i = 0; i < nmissing; i++) {
/* Everything left of the pivot must already be eliminated. */
1175 for (j = 0; j < missing[i]; j++) {
1176 ASSERT0(rows[i][j]);
1178 ASSERT3U(rows[i][missing[i]], !=, 0);
1181 * Compute the inverse of the first element and multiply each
1182 * element in the row by that value.
/*
 * In GF(2^8), 1/x == x^254 == exp(255 - log(x)); multiplying by
 * exp2(.., log) scales the row so the pivot becomes 1.
 */
1184 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1186 for (j = 0; j < n; j++) {
1187 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1188 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
/* Eliminate this pivot column from every other row of interest. */
1191 for (ii = 0; ii < nmissing; ii++) {
1195 ASSERT3U(rows[ii][missing[i]], !=, 0);
1197 log = vdev_raidz_log2[rows[ii][missing[i]]];
1199 for (j = 0; j < n; j++) {
1201 vdev_raidz_exp2(rows[i][j], log);
1203 vdev_raidz_exp2(invrows[i][j], log);
1209 * Verify that the data that is left in the rows are properly part of
1210 * an identity matrix.
1212 for (i = 0; i < nmissing; i++) {
1213 for (j = 0; j < n; j++) {
1214 if (j == missing[i]) {
1215 ASSERT3U(rows[i][j], ==, 1);
1217 ASSERT0(rows[i][j]);
/*
 * Multiply the inverted matrix by the surviving columns to regenerate
 * the missing data in place: D = (V|I)'^-1 x (d|p)' (see the block
 * comment above).  invrows[] is converted to log form up front (invlog)
 * so each per-byte multiply is a table lookup plus an add.  used[] maps
 * matrix row i to the raidz-map column supplying the source data;
 * missing[] maps output j to the destination data column.
 * NOTE(review): this excerpt is elided; the accumulation into dst[] and
 * several declarations are not visible here.
 */
1224 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1225 int *missing, uint8_t **invrows, const uint8_t *used)
1230 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1231 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1235 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
/* Scratch space for the log form of the inverse matrix. */
1239 psize = sizeof (invlog[0][0]) * n * nmissing;
1240 p = kmem_alloc(psize, KM_SLEEP);
1242 for (pp = p, i = 0; i < nmissing; i++) {
/* Precompute logs; inverse entries must be non-zero for log2[] lookup. */
1247 for (i = 0; i < nmissing; i++) {
1248 for (j = 0; j < n; j++) {
1249 ASSERT3U(invrows[i][j], !=, 0);
1250 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
/* Walk each surviving source column and scatter into the targets. */
1254 for (i = 0; i < n; i++) {
1256 ASSERT3U(c, <, rm->rm_cols);
1258 src = rm->rm_col[c].rc_data;
1259 ccount = rm->rm_col[c].rc_size;
1260 for (j = 0; j < nmissing; j++) {
1261 cc = missing[j] + rm->rm_firstdatacol;
1262 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1263 ASSERT3U(cc, <, rm->rm_cols);
1264 ASSERT3U(cc, !=, c);
1266 dst[j] = rm->rm_col[cc].rc_data;
1267 dcount[j] = rm->rm_col[cc].rc_size;
1270 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1272 for (x = 0; x < ccount; x++, src++) {
1274 log = vdev_raidz_log2[*src];
1276 for (cc = 0; cc < nmissing; cc++) {
/* Skip destinations shorter than the current offset. */
1277 if (x >= dcount[cc])
/* Multiply in log space; reduce the exponent mod 255. */
1283 if ((ll = log + invlog[cc][i]) >= 255)
1285 val = vdev_raidz_pow2[ll];
1296 kmem_free(p, psize);
/*
 * General-case reconstruction for an arbitrary set of ntgts failed
 * columns (tgts[] holds raidz-map column indices).  Builds the matrix
 * rows for the chosen parity columns, inverts them, and regenerates the
 * missing data.  Scratch storage for rows/invrows/used is carved out of
 * a single kmem_alloc'd buffer, freed at the end.
 * NOTE(review): this excerpt is elided; the construction of `code`, the
 * used[] tail, and the return are not visible here.  `code` appears to
 * encode which parity columns were used (asserted < 2^MAXPARITY) —
 * confirm against the full source.
 */
1300 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1304 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1305 int parity_map[VDEV_RAIDZ_MAXPARITY];
1310 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1311 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
/* n is the number of data columns. */
1317 n = rm->rm_cols - rm->rm_firstdatacol;
1320 * Figure out which data columns are missing.
1323 for (t = 0; t < ntgts; t++) {
1324 if (tgts[t] >= rm->rm_firstdatacol) {
1325 missing_rows[nmissing_rows++] =
1326 tgts[t] - rm->rm_firstdatacol;
1331 * Figure out which parity columns to use to help generate the missing
1334 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1336 ASSERT(c < rm->rm_firstdatacol);
1339 * Skip any targeted parity columns.
1341 if (c == tgts[tt]) {
1353 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
/* One allocation covers rows, invrows, and the used[] column list. */
1355 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1356 nmissing_rows * n + sizeof (used[0]) * n;
1357 p = kmem_alloc(psize, KM_SLEEP);
1359 for (pp = p, i = 0; i < nmissing_rows; i++) {
/* used[]: first the chosen parity columns, then the surviving data. */
1367 for (i = 0; i < nmissing_rows; i++) {
1368 used[i] = parity_map[i];
1371 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1372 if (tt < nmissing_rows &&
1373 c == missing_rows[tt] + rm->rm_firstdatacol) {
1384 * Initialize the interesting rows of the matrix.
1386 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1389 * Invert the matrix.
1391 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1395 * Reconstruct the missing data using the generated matrix.
1397 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1400 kmem_free(p, psize);
/*
 * Top-level reconstruction dispatcher.  t[] is the sorted list of nt
 * known-bad columns; additional columns with recorded errors are folded
 * in below.  tgts[] is ordered parity-first so dt points at the first
 * bad data column.  When possible we use the optimized P, Q, or P+Q
 * routines (unless vdev_raidz_default_to_general forces the general
 * path); otherwise we fall back to vdev_raidz_reconstruct_general().
 * The return value is the code identifying which parity was used.
 * NOTE(review): this excerpt is elided; the switch/case structure that
 * selects among the optimized routines is not fully visible.
 */
1406 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1408 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1412 int nbadparity, nbaddata;
1413 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1416 * The tgts list must already be sorted.
1418 for (i = 1; i < nt; i++) {
1419 ASSERT(t[i] > t[i - 1]);
/* Start pessimistic: all parity bad, all data good; adjust below. */
1422 nbadparity = rm->rm_firstdatacol;
1423 nbaddata = rm->rm_cols - nbadparity;
1425 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1426 if (c < rm->rm_firstdatacol)
1427 parity_valid[c] = B_FALSE;
1429 if (i < nt && c == t[i]) {
1432 } else if (rm->rm_col[c].rc_error != 0) {
1434 } else if (c >= rm->rm_firstdatacol) {
1437 parity_valid[c] = B_TRUE;
1442 ASSERT(ntgts >= nt);
1443 ASSERT(nbaddata >= 0);
1444 ASSERT(nbaddata + nbadparity == ntgts);
/* dt skips past the bad parity entries to the bad data columns. */
1446 dt = &tgts[nbadparity];
1449 * See if we can use any of our optimized reconstruction routines.
1451 if (!vdev_raidz_default_to_general) {
1454 if (parity_valid[VDEV_RAIDZ_P])
1455 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1457 ASSERT(rm->rm_firstdatacol > 1);
1459 if (parity_valid[VDEV_RAIDZ_Q])
1460 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1462 ASSERT(rm->rm_firstdatacol > 2);
1466 ASSERT(rm->rm_firstdatacol > 1);
1468 if (parity_valid[VDEV_RAIDZ_P] &&
1469 parity_valid[VDEV_RAIDZ_Q])
1470 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1472 ASSERT(rm->rm_firstdatacol > 2);
/* Fall back to the general matrix-based reconstruction. */
1478 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1479 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
/*
 * Open a RAID-Z vdev: validate the configuration, open all children,
 * and derive the aggregate asize/max_asize and ashift values.  The
 * vdev remains usable as long as no more than nparity children fail
 * to open.
 * NOTE(review): this excerpt is elided; the declarations of lasterror,
 * numerrors, c, cvd and the final return are not visible.
 */
1485 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1486 uint64_t *logical_ashift, uint64_t *physical_ashift)
1489 uint64_t nparity = vd->vdev_nparity;
1494 ASSERT(nparity > 0);
/* Reject impossible configs: too much parity or too few children. */
1496 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1497 vd->vdev_children < nparity + 1) {
1498 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1499 return (SET_ERROR(EINVAL));
1502 vdev_open_children(vd);
1504 for (c = 0; c < vd->vdev_children; c++) {
1505 cvd = vd->vdev_child[c];
1507 if (cvd->vdev_open_error != 0) {
1508 lasterror = cvd->vdev_open_error;
/*
 * The usable size is limited by the smallest child; the -1/+1
 * dance keeps the MIN well-defined for the initial value.
 */
1513 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1514 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1515 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1516 *physical_ashift = MAX(*physical_ashift,
1517 cvd->vdev_physical_ashift);
1520 *asize *= vd->vdev_children;
1521 *max_asize *= vd->vdev_children;
/* More failed children than parity means no replicas are available. */
1523 if (numerrors > nparity) {
1524 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
/*
 * Close a RAID-Z vdev by closing each of its children.
 */
1532 vdev_raidz_close(vdev_t *vd)
1536 for (c = 0; c < vd->vdev_children; c++)
1537 vdev_close(vd->vdev_child[c]);
1542 * Handle a read or write I/O to a RAID-Z dump device.
1544 * The dump device is in a unique situation compared to other ZFS datasets:
1545 * writing to this device should be as simple and fast as possible. In
1546 * addition, durability matters much less since the dump will be extracted
1547 * once the machine reboots. For that reason, this function eschews parity for
1548 * performance and simplicity. The dump device uses the checksum setting
1549 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1552 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
1553 * 128 KB will not fill an entire block; in addition, they may not be properly
1554 * aligned. In that case, this function uses the preallocated 128 KB block and
1555 * omits reading or writing any "empty" portions of that block, as opposed to
1556 * allocating a fresh appropriately-sized block.
1558 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1560 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1562 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1563 * allocated which spans all five child vdevs. 8 KB of data would be written to
1564 * each of four vdevs, with the fifth containing the parity bits.
1566 * parity data data data data
1567 * | PP | XX | XX | XX | XX |
1570 * 8 KB parity ------8 KB data blocks------
1572 * However, when writing to the dump device, the behavior is different:
1574 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1576 * Unlike the normal RAID-Z case in which the block is allocated based on the
1577 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
1578 * I/O size is less than 128 KB, only the actual portions of data are written.
1579 * In this example the data is written to the third data vdev since that vdev
1580 * contains the offset [64 KB, 96 KB).
1582 * parity data data data data
1588 * As a result, an individual I/O may not span all child vdevs; moreover, a
1589 * small I/O may only operate on a single child vdev.
1591 * Note that since there are no parity bits calculated or written, this format
1592 * remains the same no matter how many parity bits are used in a normal RAID-Z
1593 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
1596 * parity parity parity data data data data
1597 * | | | | | | XX | |
/*
 * Perform a direct (dump-path) read or write against a RAID-Z vdev.
 * See the block comment above: no parity is generated; the I/O is
 * mapped onto a full SPA_OLD_MAXBLOCKSIZE (128 KB) raidz map starting
 * at origoffset, and only the columns overlapping [offset, offset+size)
 * are touched via vdev_disk_physio().
 * NOTE(review): this excerpt is elided; the initialization of start/end
 * and the final return value are not visible.
 */
1603 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1604 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1606 vdev_t *tvd = vd->vdev_top;
1612 uint64_t start, end, colstart, colend;
1613 uint64_t coloffset, colsize, colskip;
1615 int flags = doread ? BIO_READ : BIO_WRITE;
1620 * Don't write past the end of the block
1622 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1628 * Allocate a RAID-Z map for this block. Note that this block starts
1629 * from the "original" offset, that is, the offset of the extent which
1630 * contains the requisite offset of the data being read or written.
1632 * Even if this I/O operation doesn't span the full block size, let's
1633 * treat the on-disk format as if the only blocks are the complete 128
1636 rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1637 SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
1638 vd->vdev_children, vd->vdev_nparity);
1640 coloffset = origoffset;
/* Walk the data columns only; parity columns are skipped entirely. */
1642 for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1643 c++, coloffset += rc->rc_size) {
1644 rc = &rm->rm_col[c];
1645 cvd = vd->vdev_child[rc->rc_devidx];
1648 * Find the start and end of this column in the RAID-Z map,
1649 * keeping in mind that the stated size and offset of the
1650 * operation may not fill the entire column for this vdev.
1652 * If any portion of the data spans this column, issue the
1653 * appropriate operation to the vdev.
1655 if (coloffset + rc->rc_size <= start)
1657 if (coloffset >= end)
1660 colstart = MAX(coloffset, start);
1661 colend = MIN(end, coloffset + rc->rc_size);
1662 colsize = colend - colstart;
1663 colskip = colstart - coloffset;
1665 VERIFY3U(colsize, <=, rc->rc_size);
1666 VERIFY3U(colskip, <=, rc->rc_size);
1669 * Note that the child vdev will have a vdev label at the start
1670 * of its range of offsets, hence the need for
1671 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1672 * example of why this calculation is needed.
1674 if ((err = vdev_disk_physio(cvd,
1675 ((char *)rc->rc_data) + colskip, colsize,
1676 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1677 flags, isdump)) != 0)
1681 vdev_raidz_map_free(rm);
/*
 * Convert a logical (physical data) size to the allocated size on a
 * RAID-Z vdev: round psize up to whole sectors, add one parity sector
 * per stripe of (cols - nparity) data sectors, then round the total up
 * to a multiple of (nparity + 1) sectors so a stripe can never be
 * smaller than nparity + 1 devices.
 */
1689 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1692 uint64_t ashift = vd->vdev_top->vdev_ashift;
1693 uint64_t cols = vd->vdev_children;
1694 uint64_t nparity = vd->vdev_nparity;
1696 asize = ((psize - 1) >> ashift) + 1;
1697 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1698 asize = roundup(asize, nparity + 1) << ashift;
/*
 * Child-I/O completion callback: record the child's error status in its
 * raidz column (rc was passed as the child zio's private data).
 */
1704 vdev_raidz_child_done(zio_t *zio)
1706 raidz_col_t *rc = zio->io_private;
1708 rc->rc_error = zio->io_error;
1714 * Start an IO operation on a RAIDZ VDev
1717 * - For write operations:
1718 * 1. Generate the parity data
1719 * 2. Create child zio write operations to each column's vdev, for both
1721 * 3. If the column skips any sectors for padding, create optional dummy
1722 * write zio children for those areas to improve aggregation continuity.
1723 * - For read operations:
1724 * 1. Create child zio read operations to each data column's vdev to read
1725 * the range of data required for zio.
1726 * 2. If this is a scrub or resilver operation, or if any of the data
1727 * vdevs have had errors, then create zio read operations to the parity
1728 * columns' VDevs as well.
/*
 * Start a RAID-Z I/O (see the block comment above for the per-type
 * strategy).  Allocates the raidz map for this zio, then issues child
 * I/Os: all columns for FREE and WRITE (plus optional skip-sector
 * writes), and for READ the data columns plus parity when needed.
 * NOTE(review): this excerpt is elided; the zio->io_vsd assignment and
 * the zio_execute/return tail are not visible.
 */
1731 vdev_raidz_io_start(zio_t *zio)
1733 vdev_t *vd = zio->io_vd;
1734 vdev_t *tvd = vd->vdev_top;
1740 rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1741 zio->io_type == ZIO_TYPE_FREE,
1742 tvd->vdev_ashift, vd->vdev_children,
1746 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1748 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
/* FREE: forward the free to every column's child vdev. */
1750 if (zio->io_type == ZIO_TYPE_FREE) {
1751 for (c = 0; c < rm->rm_cols; c++) {
1752 rc = &rm->rm_col[c];
1753 cvd = vd->vdev_child[rc->rc_devidx];
1754 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1755 rc->rc_offset, rc->rc_data, rc->rc_size,
1756 zio->io_type, zio->io_priority, 0,
1757 vdev_raidz_child_done, rc));
/* WRITE: generate parity, then write every column. */
1764 if (zio->io_type == ZIO_TYPE_WRITE) {
1765 vdev_raidz_generate_parity(rm);
1767 for (c = 0; c < rm->rm_cols; c++) {
1768 rc = &rm->rm_col[c];
1769 cvd = vd->vdev_child[rc->rc_devidx];
1770 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1771 rc->rc_offset, rc->rc_data, rc->rc_size,
1772 zio->io_type, zio->io_priority, 0,
1773 vdev_raidz_child_done, rc));
1777 * Generate optional I/Os for any skipped sectors to improve
1778 * aggregation contiguity.
1780 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1781 ASSERT(c <= rm->rm_scols);
1782 if (c == rm->rm_scols)
1784 rc = &rm->rm_col[c];
1785 cvd = vd->vdev_child[rc->rc_devidx];
1786 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1787 rc->rc_offset + rc->rc_size, NULL,
1788 1 << tvd->vdev_ashift,
1789 zio->io_type, zio->io_priority,
1790 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1797 ASSERT(zio->io_type == ZIO_TYPE_READ);
1800 * Iterate over the columns in reverse order so that we hit the parity
1801 * last -- any errors along the way will force us to read the parity.
1803 for (c = rm->rm_cols - 1; c >= 0; c--) {
1804 rc = &rm->rm_col[c];
1805 cvd = vd->vdev_child[rc->rc_devidx];
/* Unreadable child: mark it missing and don't issue an I/O. */
1806 if (!vdev_readable(cvd)) {
1807 if (c >= rm->rm_firstdatacol)
1808 rm->rm_missingdata++;
1810 rm->rm_missingparity++;
1811 rc->rc_error = SET_ERROR(ENXIO);
1812 rc->rc_tried = 1; /* don't even try */
/* Child may have stale data for this txg per its DTL. */
1816 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1817 if (c >= rm->rm_firstdatacol)
1818 rm->rm_missingdata++;
1820 rm->rm_missingparity++;
1821 rc->rc_error = SET_ERROR(ESTALE);
/*
 * Read data columns always; read parity too if data is already
 * known missing or this is a scrub/resilver.
 */
1825 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1826 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1827 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1828 rc->rc_offset, rc->rc_data, rc->rc_size,
1829 zio->io_type, zio->io_priority, 0,
1830 vdev_raidz_child_done, rc));
1839 * Report a checksum error for a child of a RAID-Z device.
/*
 * Bumps the child's checksum-error count and posts a checksum ereport
 * (with the bad data for comparison) unless the zio is speculative.
 */
1842 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1844 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1846 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1847 zio_bad_cksum_t zbc;
1848 raidz_map_t *rm = zio->io_vsd;
1850 mutex_enter(&vd->vdev_stat_lock);
1851 vd->vdev_stat.vs_checksum_errors++;
1852 mutex_exit(&vd->vdev_stat_lock);
/* No actual checksum available; note any injected errors. */
1854 zbc.zbc_has_cksum = 0;
1855 zbc.zbc_injected = rm->rm_ecksuminjected;
1857 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1858 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1864 * We keep track of whether or not there were any injected errors, so that
1865 * any ereports we generate can note it.
/*
 * Verify the zio's checksum; returns zio_checksum_error()'s result
 * (0 on success).  Records on the raidz map whether the failure was
 * due to an injected error.
 */
1868 raidz_checksum_verify(zio_t *zio)
1870 zio_bad_cksum_t zbc;
1871 raidz_map_t *rm = zio->io_vsd;
1873 int ret = zio_checksum_error(zio, &zbc);
1874 if (ret != 0 && zbc.zbc_injected != 0)
1875 rm->rm_ecksuminjected = 1;
1881 * Generate the parity from the data columns. If we tried and were able to
1882 * read the parity without error, verify that the generated parity matches the
1883 * data we read. If it doesn't, we fire off a checksum error. Return the
1884 * number of such failures.
1887 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1889 void *orig[VDEV_RAIDZ_MAXPARITY];
1893 blkptr_t *bp = zio->io_bp;
1894 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1895 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
/* Dump-device blocks carry no parity, so there is nothing to verify. */
1897 if (checksum == ZIO_CHECKSUM_NOPARITY)
/* Save a copy of each parity column that was read successfully. */
1900 for (c = 0; c < rm->rm_firstdatacol; c++) {
1901 rc = &rm->rm_col[c];
1902 if (!rc->rc_tried || rc->rc_error != 0)
1904 orig[c] = zio_buf_alloc(rc->rc_size);
1905 bcopy(rc->rc_data, orig[c], rc->rc_size);
/* Regenerate parity in place from the data columns. */
1908 vdev_raidz_generate_parity(rm);
/* Compare regenerated parity to what was read; report mismatches. */
1910 for (c = 0; c < rm->rm_firstdatacol; c++) {
1911 rc = &rm->rm_col[c];
1912 if (!rc->rc_tried || rc->rc_error != 0)
1914 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1915 raidz_checksum_error(zio, rc, orig[c]);
1916 rc->rc_error = SET_ERROR(ECKSUM);
1919 zio_buf_free(orig[c], rc->rc_size);
1926 * Keep statistics on all the ways that we used parity to correct data.
/* Indexed by the parity-combination code returned by reconstruction. */
1928 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
/*
 * Return the most severe error recorded across all columns of the map,
 * as ranked by zio_worst_error().
 */
1931 vdev_raidz_worst_error(raidz_map_t *rm)
1935 for (int c = 0; c < rm->rm_cols; c++)
1936 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1942 * Iterate over all combinations of bad data and attempt a reconstruction.
1943 * Note that the algorithm below is non-optimal because it doesn't take into
1944 * account how reconstruction is actually performed. For example, with
1945 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1946 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1947 * cases we'd only use parity information in column 0.
/*
 * For n = 1 .. (parity - total_errors) extra assumed-bad columns, try
 * every combination of n error-free columns as reconstruction targets;
 * on each attempt, save the original column data, reconstruct, and
 * check the zio checksum.  Returns the successful reconstruction code,
 * or (per the structure visible here) falls through when all
 * combinations fail.
 * NOTE(review): this excerpt is elided; the loop bookkeeping around
 * `current`/`next` and the success/failure exits are only partially
 * visible.
 */
1950 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1952 raidz_map_t *rm = zio->io_vsd;
1954 void *orig[VDEV_RAIDZ_MAXPARITY];
/* tstore[0] is a sentinel so tgts[-1] is valid; tgts[n] is set below. */
1955 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1956 int *tgts = &tstore[1];
1957 int current, next, i, c, n;
1960 ASSERT(total_errors < rm->rm_firstdatacol);
1963 * This simplifies one edge condition.
1967 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1969 * Initialize the targets array by finding the first n columns
1970 * that contain no error.
1972 * If there were no data errors, we need to ensure that we're
1973 * always explicitly attempting to reconstruct at least one
1974 * data column. To do this, we simply push the highest target
1975 * up into the data columns.
1977 for (c = 0, i = 0; i < n; i++) {
1978 if (i == n - 1 && data_errors == 0 &&
1979 c < rm->rm_firstdatacol) {
1980 c = rm->rm_firstdatacol;
1983 while (rm->rm_col[c].rc_error != 0) {
1985 ASSERT3S(c, <, rm->rm_cols);
1992 * Setting tgts[n] simplifies the other edge condition.
1994 tgts[n] = rm->rm_cols;
1997 * These buffers were allocated in previous iterations.
1999 for (i = 0; i < n - 1; i++) {
2000 ASSERT(orig[i] != NULL);
/* All columns share rc_size, so column 0's size works for any. */
2003 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2006 next = tgts[current];
2008 while (current != n) {
2009 tgts[current] = next;
2013 * Save off the original data that we're going to
2014 * attempt to reconstruct.
2016 for (i = 0; i < n; i++) {
2017 ASSERT(orig[i] != NULL);
2020 ASSERT3S(c, <, rm->rm_cols);
2021 rc = &rm->rm_col[c];
2022 bcopy(rc->rc_data, orig[i], rc->rc_size);
2026 * Attempt a reconstruction and exit the outer loop on
2029 code = vdev_raidz_reconstruct(rm, tgts, n);
2030 if (raidz_checksum_verify(zio) == 0) {
2031 atomic_inc_64(&raidz_corrected[code]);
/* Flag each reconstructed column as a checksum error. */
2033 for (i = 0; i < n; i++) {
2035 rc = &rm->rm_col[c];
2036 ASSERT(rc->rc_error == 0);
2038 raidz_checksum_error(zio, rc,
2040 rc->rc_error = SET_ERROR(ECKSUM);
2048 * Restore the original data.
2050 for (i = 0; i < n; i++) {
2052 rc = &rm->rm_col[c];
2053 bcopy(orig[i], rc->rc_data, rc->rc_size);
2058 * Find the next valid column after the current
2061 for (next = tgts[current] + 1;
2062 next < rm->rm_cols &&
2063 rm->rm_col[next].rc_error != 0; next++)
2066 ASSERT(next <= tgts[current + 1]);
2069 * If that spot is available, we're done here.
2071 if (next != tgts[current + 1])
2075 * Otherwise, find the next valid column after
2076 * the previous position.
2078 for (c = tgts[current - 1] + 1;
2079 rm->rm_col[c].rc_error != 0; c++)
2085 } while (current != n);
/* Release the per-combination scratch buffers. */
2090 for (i = 0; i < n; i++) {
2091 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2098 * Complete an IO operation on a RAIDZ VDev
2101 * - For write operations:
2102 * 1. Check for errors on the child IOs.
2103 * 2. Return, setting an error code if too few child VDevs were written
2104 * to reconstruct the data later. Note that partial writes are
2105 * considered successful if they can be reconstructed at all.
2106 * - For read operations:
2107 * 1. Check for errors on the child IOs.
2108 * 2. If data errors occurred:
2109 * a. Try to reassemble the data from the parity available.
2110 * b. If we haven't yet read the parity drives, read them now.
2111 * c. If all parity drives have been read but the data still doesn't
2112 * reassemble with a correct checksum, then try combinatorial
2114 * d. If that doesn't work, return an error.
2115 * 3. If there were unexpected errors or this is a resilver operation,
2116 * rewrite the vdevs that had errors.
/*
 * Complete a RAID-Z I/O (see the block comment above for the per-type
 * strategy): tally per-column errors, then for reads walk the three
 * recovery phases (use what we read, re-read everything, combinatorial
 * reconstruction), and finally self-heal any damaged children when the
 * pool is writeable.
 * NOTE(review): this excerpt is elided; several declarations, the error
 * tally details, and some control-flow lines are not visible.
 */
2119 vdev_raidz_io_done(zio_t *zio)
2121 vdev_t *vd = zio->io_vd;
2123 raidz_map_t *rm = zio->io_vsd;
2125 int unexpected_errors = 0;
2126 int parity_errors = 0;
2127 int parity_untried = 0;
2128 int data_errors = 0;
2129 int total_errors = 0;
2131 int tgts[VDEV_RAIDZ_MAXPARITY];
2134 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
2136 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2137 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
/* Classify each column's outcome into the error counters above. */
2139 for (c = 0; c < rm->rm_cols; c++) {
2140 rc = &rm->rm_col[c];
2143 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
2145 if (c < rm->rm_firstdatacol)
2150 if (!rc->rc_skipped)
2151 unexpected_errors++;
2154 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2159 if (zio->io_type == ZIO_TYPE_WRITE) {
2161 * XXX -- for now, treat partial writes as a success.
2162 * (If we couldn't write enough columns to reconstruct
2163 * the data, the I/O failed. Otherwise, good enough.)
2165 * Now that we support write reallocation, it would be better
2166 * to treat partial failure as real failure unless there are
2167 * no non-degraded top-level vdevs left, and not update DTLs
2168 * if we intend to reallocate.
2171 if (total_errors > rm->rm_firstdatacol)
2172 zio->io_error = vdev_raidz_worst_error(rm);
2175 } else if (zio->io_type == ZIO_TYPE_FREE) {
2179 ASSERT(zio->io_type == ZIO_TYPE_READ);
2181 * There are three potential phases for a read:
2182 * 1. produce valid data from the columns read
2183 * 2. read all disks and try again
2184 * 3. perform combinatorial reconstruction
2186 * Each phase is progressively both more expensive and less likely to
2187 * occur. If we encounter more errors than we can repair or all phases
2188 * fail, we have no choice but to return an error.
2192 * If the number of errors we saw was correctable -- less than or equal
2193 * to the number of parity disks read -- attempt to produce data that
2194 * has a valid checksum. Naturally, this case applies in the absence of
2197 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2198 if (data_errors == 0) {
2199 if (raidz_checksum_verify(zio) == 0) {
2201 * If we read parity information (unnecessarily
2202 * as it happens since no reconstruction was
2203 * needed) regenerate and verify the parity.
2204 * We also regenerate parity when resilvering
2205 * so we can write it out to the failed device
2208 if (parity_errors + parity_untried <
2209 rm->rm_firstdatacol ||
2210 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2211 n = raidz_parity_verify(zio, rm);
2212 unexpected_errors += n;
2213 ASSERT(parity_errors + n <=
2214 rm->rm_firstdatacol);
2220 * We either attempt to read all the parity columns or
2221 * none of them. If we didn't try to read parity, we
2222 * wouldn't be here in the correctable case. There must
2223 * also have been fewer parity errors than parity
2224 * columns or, again, we wouldn't be in this code path.
2226 ASSERT(parity_untried == 0);
2227 ASSERT(parity_errors < rm->rm_firstdatacol);
2230 * Identify the data columns that reported an error.
2233 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2234 rc = &rm->rm_col[c];
2235 if (rc->rc_error != 0) {
2236 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2241 ASSERT(rm->rm_firstdatacol >= n);
2243 code = vdev_raidz_reconstruct(rm, tgts, n);
2245 if (raidz_checksum_verify(zio) == 0) {
2246 atomic_inc_64(&raidz_corrected[code]);
2249 * If we read more parity disks than were used
2250 * for reconstruction, confirm that the other
2251 * parity disks produced correct data. This
2252 * routine is suboptimal in that it regenerates
2253 * the parity that we already used in addition
2254 * to the parity that we're attempting to
2255 * verify, but this should be a relatively
2256 * uncommon case, and can be optimized if it
2257 * becomes a problem. Note that we regenerate
2258 * parity when resilvering so we can write it
2259 * out to failed devices later.
2261 if (parity_errors < rm->rm_firstdatacol - n ||
2262 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2263 n = raidz_parity_verify(zio, rm);
2264 unexpected_errors += n;
2265 ASSERT(parity_errors + n <=
2266 rm->rm_firstdatacol);
2275 * This isn't a typical situation -- either we got a read error or
2276 * a child silently returned bad data. Read every block so we can
2277 * try again with as much data and parity as we can track down. If
2278 * we've already been through once before, all children will be marked
2279 * as tried so we'll proceed to combinatorial reconstruction.
2281 unexpected_errors = 1;
2282 rm->rm_missingdata = 0;
2283 rm->rm_missingparity = 0;
2285 for (c = 0; c < rm->rm_cols; c++) {
2286 if (rm->rm_col[c].rc_tried)
/* Re-queue this zio and issue reads for the untried columns. */
2289 zio_vdev_io_redone(zio);
2291 rc = &rm->rm_col[c];
2294 zio_nowait(zio_vdev_child_io(zio, NULL,
2295 vd->vdev_child[rc->rc_devidx],
2296 rc->rc_offset, rc->rc_data, rc->rc_size,
2297 zio->io_type, zio->io_priority, 0,
2298 vdev_raidz_child_done, rc));
2299 } while (++c < rm->rm_cols);
2305 * At this point we've attempted to reconstruct the data given the
2306 * errors we detected, and we've attempted to read all columns. There
2307 * must, therefore, be one or more additional problems -- silent errors
2308 * resulting in invalid data rather than explicit I/O errors resulting
2309 * in absent data. We check if there is enough additional data to
2310 * possibly reconstruct the data and then perform combinatorial
2311 * reconstruction over all possible combinations. If that fails,
2314 if (total_errors > rm->rm_firstdatacol) {
2315 zio->io_error = vdev_raidz_worst_error(rm);
2317 } else if (total_errors < rm->rm_firstdatacol &&
2318 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2320 * If we didn't use all the available parity for the
2321 * combinatorial reconstruction, verify that the remaining
2322 * parity is correct.
2324 if (code != (1 << rm->rm_firstdatacol) - 1)
2325 (void) raidz_parity_verify(zio, rm);
2328 * We're here because either:
2330 * total_errors == rm_firstdatacol, or
2331 * vdev_raidz_combrec() failed
2333 * In either case, there is enough bad data to prevent
2336 * Start checksum ereports for all children which haven't
2337 * failed, and the IO wasn't speculative.
2339 zio->io_error = SET_ERROR(ECKSUM);
2341 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2342 for (c = 0; c < rm->rm_cols; c++) {
2343 rc = &rm->rm_col[c];
2344 if (rc->rc_error == 0) {
2345 zio_bad_cksum_t zbc;
2346 zbc.zbc_has_cksum = 0;
2348 rm->rm_ecksuminjected;
2350 zfs_ereport_start_checksum(
2352 vd->vdev_child[rc->rc_devidx],
2353 zio, rc->rc_offset, rc->rc_size,
2354 (void *)(uintptr_t)c, &zbc);
2361 zio_checksum_verified(zio);
/* Self-heal: rewrite bad columns from the reconstructed data. */
2363 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2364 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2366 * Use the good data we have in hand to repair damaged children.
2368 for (c = 0; c < rm->rm_cols; c++) {
2369 rc = &rm->rm_col[c];
2370 cvd = vd->vdev_child[rc->rc_devidx];
2372 if (rc->rc_error == 0)
2375 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2376 rc->rc_offset, rc->rc_data, rc->rc_size,
2377 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2378 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2379 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
/*
 * Propagate child state changes to the RAID-Z vdev: can't open if more
 * children are faulted than parity can cover, degraded if any child is
 * faulted or degraded, healthy otherwise.
 */
2385 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2387 if (faulted > vd->vdev_nparity)
2388 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2389 VDEV_AUX_NO_REPLICAS);
2390 else if (degraded + faulted != 0)
2391 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2393 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
/*
 * Operations vector registering the RAID-Z vdev type.
 * NOTE(review): this excerpt is elided; only some of the function
 * pointers (open/close/asize/io_done, etc.) are visible here.
 */
2396 vdev_ops_t vdev_raidz_ops = {
2400 vdev_raidz_io_start,
2402 vdev_raidz_state_change,
2405 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2406 B_FALSE /* not a leaf vdev */