4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
28 #include <sys/zfs_context.h>
30 #include <sys/vdev_impl.h>
32 #include <sys/vdev_disk.h>
34 #include <sys/vdev_file.h>
35 #include <sys/vdev_raidz.h>
37 #include <sys/zio_checksum.h>
38 #include <sys/fs/zfs.h>
39 #include <sys/fm/fs/zfs.h>
43 * Virtual device vector for RAID-Z.
45 * This vdev supports single, double, and triple parity. For single parity,
46 * we use a simple XOR of all the data columns. For double or triple parity,
47 * we use a special case of Reed-Solomon coding. This extends the
48 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
49 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
50 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
51 * former is also based. The latter is designed to provide higher performance
54 * Note that the Plank paper claimed to support arbitrary N+M, but was then
55 * amended six years later identifying a critical flaw that invalidates its
56 * claims. Nevertheless, the technique can be adapted to work for up to
57 * triple parity. For additional parity, the amendment "Note: Correction to
58 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
59 * is viable, but the additional complexity means that write performance will
62 * All of the methods above operate on a Galois field, defined over the
63 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
64 * can be expressed with a single byte. Briefly, the operations on the
65 * field are defined as follows:
67 * o addition (+) is represented by a bitwise XOR
68 * o subtraction (-) is therefore identical to addition: A + B = A - B
69 * o multiplication of A by 2 is defined by the following bitwise expression:
74 * (A * 2)_4 = A_3 + A_7
75 * (A * 2)_3 = A_2 + A_7
76 * (A * 2)_2 = A_1 + A_7
80 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
81 * As an aside, this multiplication is derived from the error correcting
82 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
84 * Observe that any number in the field (except for 0) can be expressed as a
85 * power of 2 -- a generator for the field. We store a table of the powers of
86 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
87 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
88 * than field addition). The inverse of a field element A (A^-1) is therefore
89 * A ^ (255 - 1) = A^254.
91 * The up-to-three parity columns, P, Q, R over several data columns,
92 * D_0, ... D_n-1, can be expressed by field operations:
94 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
95 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
96 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
97 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
98 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
100 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
101 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
102 * independent coefficients. (There are no additional coefficients that have
103 * this property which is why the uncorrected Plank method breaks down.)
105 * See the reconstruction code below for how P, Q and R can be used individually
106 * or in concert to recover missing data columns.
/*
 * Per-child-device state for one column of a RAID-Z I/O: which child the
 * column targets, where it lands on that child, its buffers, and the status
 * of the attempted I/O.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
120 typedef struct raidz_map {
121 uint64_t rm_cols; /* Regular column count */
122 uint64_t rm_scols; /* Count including skipped columns */
123 uint64_t rm_bigcols; /* Number of oversized columns */
124 uint64_t rm_asize; /* Actual total I/O size */
125 uint64_t rm_missingdata; /* Count of missing data devices */
126 uint64_t rm_missingparity; /* Count of missing parity devices */
127 uint64_t rm_firstdatacol; /* First data column/parity count */
128 uint64_t rm_nskip; /* Skipped sectors for padding */
129 uint64_t rm_skipstart; /* Column index of padding start */
130 void *rm_datacopy; /* rm_asize-buffer of copied data */
131 uintptr_t rm_reports; /* # of referencing checksum reports */
132 uint8_t rm_freed; /* map no longer has referencing ZIO */
133 uint8_t rm_ecksuminjected; /* checksum error was injected */
134 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single GF(2^8) element by 2 (respectively 4): shift left and,
 * if the top bit was set, reduce by the primitive polynomial (0x1d).
 * The result is meaningful modulo 256; assign it to a uint8_t.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
/* Translate a physical offset to its location past the front vdev labels. */
#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;
/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
242 static void vdev_raidz_generate_parity(raidz_map_t *rm);
245 * Multiply a given number by 2 raised to the given power.
248 vdev_raidz_exp2(uint_t a, int exp)
254 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
256 exp += vdev_raidz_log2[a];
260 return (vdev_raidz_pow2[exp]);
264 vdev_raidz_map_free(raidz_map_t *rm)
269 for (c = 0; c < rm->rm_firstdatacol; c++) {
270 if (rm->rm_col[c].rc_data != NULL)
271 zio_buf_free(rm->rm_col[c].rc_data,
272 rm->rm_col[c].rc_size);
274 if (rm->rm_col[c].rc_gdata != NULL)
275 zio_buf_free(rm->rm_col[c].rc_gdata,
276 rm->rm_col[c].rc_size);
280 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
281 size += rm->rm_col[c].rc_size;
283 if (rm->rm_datacopy != NULL)
284 zio_buf_free(rm->rm_datacopy, size);
286 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
290 vdev_raidz_map_free_vsd(zio_t *zio)
292 raidz_map_t *rm = zio->io_vsd;
294 ASSERT0(rm->rm_freed);
297 if (rm->rm_reports == 0)
298 vdev_raidz_map_free(rm);
303 vdev_raidz_cksum_free(void *arg, size_t ignored)
305 raidz_map_t *rm = arg;
307 ASSERT3U(rm->rm_reports, >, 0);
309 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
310 vdev_raidz_map_free(rm);
314 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
316 raidz_map_t *rm = zcr->zcr_cbdata;
317 size_t c = zcr->zcr_cbinfo;
320 const char *good = NULL;
321 const char *bad = rm->rm_col[c].rc_data;
323 if (good_data == NULL) {
324 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
328 if (c < rm->rm_firstdatacol) {
330 * The first time through, calculate the parity blocks for
331 * the good data (this relies on the fact that the good
332 * data never changes for a given logical ZIO)
334 if (rm->rm_col[0].rc_gdata == NULL) {
335 char *bad_parity[VDEV_RAIDZ_MAXPARITY];
339 * Set up the rm_col[]s to generate the parity for
340 * good_data, first saving the parity bufs and
341 * replacing them with buffers to hold the result.
343 for (x = 0; x < rm->rm_firstdatacol; x++) {
344 bad_parity[x] = rm->rm_col[x].rc_data;
345 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
346 zio_buf_alloc(rm->rm_col[x].rc_size);
349 /* fill in the data columns from good_data */
350 buf = (char *)good_data;
351 for (; x < rm->rm_cols; x++) {
352 rm->rm_col[x].rc_data = buf;
353 buf += rm->rm_col[x].rc_size;
357 * Construct the parity from the good data.
359 vdev_raidz_generate_parity(rm);
361 /* restore everything back to its original state */
362 for (x = 0; x < rm->rm_firstdatacol; x++)
363 rm->rm_col[x].rc_data = bad_parity[x];
365 buf = rm->rm_datacopy;
366 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
367 rm->rm_col[x].rc_data = buf;
368 buf += rm->rm_col[x].rc_size;
372 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
373 good = rm->rm_col[c].rc_gdata;
375 /* adjust good_data to point at the start of our column */
378 for (x = rm->rm_firstdatacol; x < c; x++)
379 good += rm->rm_col[x].rc_size;
382 /* we drop the ereport if it ends up that the data was good */
383 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
387 * Invoked indirectly by zfs_ereport_start_checksum(), called
388 * below when our read operation fails completely. The main point
389 * is to keep a copy of everything we read from disk, so that at
390 * vdev_raidz_cksum_finish() time we can compare it with the good data.
393 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
395 size_t c = (size_t)(uintptr_t)arg;
398 raidz_map_t *rm = zio->io_vsd;
401 /* set up the report and bump the refcount */
402 zcr->zcr_cbdata = rm;
404 zcr->zcr_finish = vdev_raidz_cksum_finish;
405 zcr->zcr_free = vdev_raidz_cksum_free;
408 ASSERT3U(rm->rm_reports, >, 0);
410 if (rm->rm_datacopy != NULL)
414 * It's the first time we're called for this raidz_map_t, so we need
415 * to copy the data aside; there's no guarantee that our zio's buffer
416 * won't be re-used for something else.
418 * Our parity data is already in separate buffers, so there's no need
423 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
424 size += rm->rm_col[c].rc_size;
426 buf = rm->rm_datacopy = zio_buf_alloc(size);
428 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
429 raidz_col_t *col = &rm->rm_col[c];
431 bcopy(col->rc_data, buf, col->rc_size);
436 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
439 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
440 vdev_raidz_map_free_vsd,
441 vdev_raidz_cksum_report
445 * Divides the IO evenly across all child vdevs; usually, dcols is
446 * the number of children in the target vdev.
449 vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
450 uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
453 /* The starting RAIDZ (parent) vdev sector of the block. */
454 uint64_t b = offset >> unit_shift;
455 /* The zio's size in units of the vdev's minimum sector size. */
456 uint64_t s = size >> unit_shift;
457 /* The first column for this stripe. */
458 uint64_t f = b % dcols;
459 /* The starting byte offset on each child vdev. */
460 uint64_t o = (b / dcols) << unit_shift;
461 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
464 * "Quotient": The number of data sectors for this stripe on all but
465 * the "big column" child vdevs that also contain "remainder" data.
467 q = s / (dcols - nparity);
470 * "Remainder": The number of partial stripe data sectors in this I/O.
471 * This will add a sector to some, but not all, child vdevs.
473 r = s - q * (dcols - nparity);
475 /* The number of "big columns" - those which contain remainder data. */
476 bc = (r == 0 ? 0 : r + nparity);
479 * The total number of data and parity sectors associated with
482 tot = s + nparity * (q + (r == 0 ? 0 : 1));
484 /* acols: The columns that will be accessed. */
485 /* scols: The columns that will be accessed or skipped. */
487 /* Our I/O request doesn't span all child vdevs. */
489 scols = MIN(dcols, roundup(bc, nparity + 1));
495 ASSERT3U(acols, <=, scols);
497 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
500 rm->rm_scols = scols;
502 rm->rm_skipstart = bc;
503 rm->rm_missingdata = 0;
504 rm->rm_missingparity = 0;
505 rm->rm_firstdatacol = nparity;
506 rm->rm_datacopy = NULL;
509 rm->rm_ecksuminjected = 0;
513 for (c = 0; c < scols; c++) {
518 coff += 1ULL << unit_shift;
520 rm->rm_col[c].rc_devidx = col;
521 rm->rm_col[c].rc_offset = coff;
522 rm->rm_col[c].rc_data = NULL;
523 rm->rm_col[c].rc_gdata = NULL;
524 rm->rm_col[c].rc_error = 0;
525 rm->rm_col[c].rc_tried = 0;
526 rm->rm_col[c].rc_skipped = 0;
529 rm->rm_col[c].rc_size = 0;
531 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
533 rm->rm_col[c].rc_size = q << unit_shift;
535 asize += rm->rm_col[c].rc_size;
538 ASSERT3U(asize, ==, tot << unit_shift);
539 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
540 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
541 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
542 ASSERT3U(rm->rm_nskip, <=, nparity);
545 for (c = 0; c < rm->rm_firstdatacol; c++) {
546 rm->rm_col[c].rc_data =
547 zio_buf_alloc(rm->rm_col[c].rc_size);
550 rm->rm_col[c].rc_data = data;
552 for (c = c + 1; c < acols; c++) {
553 rm->rm_col[c].rc_data =
554 (char *)rm->rm_col[c - 1].rc_data +
555 rm->rm_col[c - 1].rc_size;
560 * If all data stored spans all columns, there's a danger that parity
561 * will always be on the same device and, since parity isn't read
562 * during normal operation, that that device's I/O bandwidth won't be
563 * used effectively. We therefore switch the parity every 1MB.
565 * ... at least that was, ostensibly, the theory. As a practical
566 * matter unless we juggle the parity between all devices evenly, we
567 * won't see any benefit. Further, occasional writes that aren't a
568 * multiple of the LCM of the number of children and the minimum
569 * stripe width are sufficient to avoid pessimal behavior.
570 * Unfortunately, this decision created an implicit on-disk format
571 * requirement that we need to support for all eternity, but only
572 * for single-parity RAID-Z.
574 * If we intend to skip a sector in the zeroth column for padding
575 * we must make sure to note this swap. We will never intend to
576 * skip the first column since at least one data and one parity
577 * column must appear in each row.
579 ASSERT(rm->rm_cols >= 2);
580 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
582 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
583 devidx = rm->rm_col[0].rc_devidx;
584 o = rm->rm_col[0].rc_offset;
585 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
586 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
587 rm->rm_col[1].rc_devidx = devidx;
588 rm->rm_col[1].rc_offset = o;
590 if (rm->rm_skipstart == 0)
591 rm->rm_skipstart = 1;
598 vdev_raidz_generate_parity_p(raidz_map_t *rm)
600 uint64_t *p, *src, pcount, ccount, i;
603 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
605 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
606 src = rm->rm_col[c].rc_data;
607 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
608 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
610 if (c == rm->rm_firstdatacol) {
611 ASSERT(ccount == pcount);
612 for (i = 0; i < ccount; i++, src++, p++) {
616 ASSERT(ccount <= pcount);
617 for (i = 0; i < ccount; i++, src++, p++) {
625 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
627 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
630 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
631 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
632 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
634 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
635 src = rm->rm_col[c].rc_data;
636 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
637 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
639 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
641 if (c == rm->rm_firstdatacol) {
642 ASSERT(ccnt == pcnt || ccnt == 0);
643 for (i = 0; i < ccnt; i++, src++, p++, q++) {
647 for (; i < pcnt; i++, src++, p++, q++) {
652 ASSERT(ccnt <= pcnt);
655 * Apply the algorithm described above by multiplying
656 * the previous result and adding in the new value.
658 for (i = 0; i < ccnt; i++, src++, p++, q++) {
661 VDEV_RAIDZ_64MUL_2(*q, mask);
666 * Treat short columns as though they are full of 0s.
667 * Note that there's therefore nothing needed for P.
669 for (; i < pcnt; i++, q++) {
670 VDEV_RAIDZ_64MUL_2(*q, mask);
677 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
679 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
682 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
683 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
684 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
685 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
686 rm->rm_col[VDEV_RAIDZ_R].rc_size);
688 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
689 src = rm->rm_col[c].rc_data;
690 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
691 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
692 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
694 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
696 if (c == rm->rm_firstdatacol) {
697 ASSERT(ccnt == pcnt || ccnt == 0);
698 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
703 for (; i < pcnt; i++, src++, p++, q++, r++) {
709 ASSERT(ccnt <= pcnt);
712 * Apply the algorithm described above by multiplying
713 * the previous result and adding in the new value.
715 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
718 VDEV_RAIDZ_64MUL_2(*q, mask);
721 VDEV_RAIDZ_64MUL_4(*r, mask);
726 * Treat short columns as though they are full of 0s.
727 * Note that there's therefore nothing needed for P.
729 for (; i < pcnt; i++, q++, r++) {
730 VDEV_RAIDZ_64MUL_2(*q, mask);
731 VDEV_RAIDZ_64MUL_4(*r, mask);
738 * Generate RAID parity in the first virtual columns according to the number of
739 * parity columns available.
742 vdev_raidz_generate_parity(raidz_map_t *rm)
744 switch (rm->rm_firstdatacol) {
746 vdev_raidz_generate_parity_p(rm);
749 vdev_raidz_generate_parity_pq(rm);
752 vdev_raidz_generate_parity_pqr(rm);
755 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
760 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
762 uint64_t *dst, *src, xcount, ccount, count, i;
767 ASSERT(x >= rm->rm_firstdatacol);
768 ASSERT(x < rm->rm_cols);
770 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
771 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
774 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
775 dst = rm->rm_col[x].rc_data;
776 for (i = 0; i < xcount; i++, dst++, src++) {
780 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
781 src = rm->rm_col[c].rc_data;
782 dst = rm->rm_col[x].rc_data;
787 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
788 count = MIN(ccount, xcount);
790 for (i = 0; i < count; i++, dst++, src++) {
795 return (1 << VDEV_RAIDZ_P);
799 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
801 uint64_t *dst, *src, xcount, ccount, count, mask, i;
808 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
809 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
811 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
812 src = rm->rm_col[c].rc_data;
813 dst = rm->rm_col[x].rc_data;
818 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
820 count = MIN(ccount, xcount);
822 if (c == rm->rm_firstdatacol) {
823 for (i = 0; i < count; i++, dst++, src++) {
826 for (; i < xcount; i++, dst++) {
831 for (i = 0; i < count; i++, dst++, src++) {
832 VDEV_RAIDZ_64MUL_2(*dst, mask);
836 for (; i < xcount; i++, dst++) {
837 VDEV_RAIDZ_64MUL_2(*dst, mask);
842 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
843 dst = rm->rm_col[x].rc_data;
844 exp = 255 - (rm->rm_cols - 1 - x);
846 for (i = 0; i < xcount; i++, dst++, src++) {
848 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
849 *b = vdev_raidz_exp2(*b, exp);
853 return (1 << VDEV_RAIDZ_Q);
857 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
859 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
861 uint64_t xsize, ysize, i;
867 ASSERT(x >= rm->rm_firstdatacol);
868 ASSERT(y < rm->rm_cols);
870 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
873 * Move the parity data aside -- we're going to compute parity as
874 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
875 * reuse the parity generation mechanism without trashing the actual
876 * parity so we make those columns appear to be full of zeros by
877 * setting their lengths to zero.
879 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
880 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
881 xsize = rm->rm_col[x].rc_size;
882 ysize = rm->rm_col[y].rc_size;
884 rm->rm_col[VDEV_RAIDZ_P].rc_data =
885 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
886 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
887 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
888 rm->rm_col[x].rc_size = 0;
889 rm->rm_col[y].rc_size = 0;
891 vdev_raidz_generate_parity_pq(rm);
893 rm->rm_col[x].rc_size = xsize;
894 rm->rm_col[y].rc_size = ysize;
898 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
899 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
900 xd = rm->rm_col[x].rc_data;
901 yd = rm->rm_col[y].rc_data;
905 * Pxy = P + D_x + D_y
906 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
908 * We can then solve for D_x:
909 * D_x = A * (P + Pxy) + B * (Q + Qxy)
911 * A = 2^(x - y) * (2^(x - y) + 1)^-1
912 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
914 * With D_x in hand, we can easily solve for D_y:
915 * D_y = P + Pxy + D_x
918 a = vdev_raidz_pow2[255 + x - y];
919 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
920 tmp = 255 - vdev_raidz_log2[a ^ 1];
922 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
923 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
925 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
926 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
927 vdev_raidz_exp2(*q ^ *qxy, bexp);
930 *yd = *p ^ *pxy ^ *xd;
933 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
934 rm->rm_col[VDEV_RAIDZ_P].rc_size);
935 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
936 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
939 * Restore the saved parity data.
941 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
942 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
944 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
949 * In the general case of reconstruction, we must solve the system of linear
950 * equations defined by the coefficients used to generate parity as well as
951 * the contents of the data and parity disks. This can be expressed with
952 * vectors for the original data (D) and the actual data (d) and parity (p)
953 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
957 * | V | | D_0 | | p_m-1 |
958 * | | x | : | = | d_0 |
959 * | I | | D_n-1 | | : |
960 * | | ~~ ~~ | d_n-1 |
963 * I is simply a square identity matrix of size n, and V is a vandermonde
964 * matrix defined by the coefficients we chose for the various parity columns
965 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
966 * computation as well as linear separability.
969 * | 1 .. 1 1 1 | | p_0 |
970 * | 2^n-1 .. 4 2 1 | __ __ | : |
971 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
972 * | 1 .. 0 0 0 | | D_1 | | d_0 |
973 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
974 * | : : : : | | : | | d_2 |
975 * | 0 .. 1 0 0 | | D_n-1 | | : |
976 * | 0 .. 0 1 0 | ~~ ~~ | : |
977 * | 0 .. 0 0 1 | | d_n-1 |
980 * Note that I, V, d, and p are known. To compute D, we must invert the
981 * matrix and use the known data and parity values to reconstruct the unknown
982 * data values. We begin by removing the rows in V|I and d|p that correspond
983 * to failed or missing columns; we then make V|I square (n x n) and d|p
984 * sized n by removing rows corresponding to unused parity from the bottom up
985 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
986 * using Gauss-Jordan elimination. In the example below we use m=3 parity
987 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
989 * | 1 1 1 1 1 1 1 1 |
990 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
991 * | 19 205 116 29 64 16 4 1 | / /
992 * | 1 0 0 0 0 0 0 0 | / /
993 * | 0 1 0 0 0 0 0 0 | <--' /
994 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
995 * | 0 0 0 1 0 0 0 0 |
996 * | 0 0 0 0 1 0 0 0 |
997 * | 0 0 0 0 0 1 0 0 |
998 * | 0 0 0 0 0 0 1 0 |
999 * | 0 0 0 0 0 0 0 1 |
1002 * | 1 1 1 1 1 1 1 1 |
1003 * | 19 205 116 29 64 16 4 1 |
1004 * | 1 0 0 0 0 0 0 0 |
1005 * (V|I)' = | 0 0 0 1 0 0 0 0 |
1006 * | 0 0 0 0 1 0 0 0 |
1007 * | 0 0 0 0 0 1 0 0 |
1008 * | 0 0 0 0 0 0 1 0 |
1009 * | 0 0 0 0 0 0 0 1 |
1012 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1013 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1014 * matrix is not singular.
1016 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1017 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1018 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1019 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1020 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1021 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1022 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1023 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1026 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1027 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1028 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1029 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1030 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1031 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1032 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1033 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1036 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1037 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1038 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1039 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1040 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1041 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1042 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1043 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1046 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1047 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1048 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1049 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1050 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1051 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1052 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1053 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1056 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1057 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1058 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1059 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1060 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1061 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1062 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1063 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1066 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1067 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1068 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1069 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1070 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1071 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1072 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1073 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1076 * | 0 0 1 0 0 0 0 0 |
1077 * | 167 100 5 41 159 169 217 208 |
1078 * | 166 100 4 40 158 168 216 209 |
1079 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1080 * | 0 0 0 0 1 0 0 0 |
1081 * | 0 0 0 0 0 1 0 0 |
1082 * | 0 0 0 0 0 0 1 0 |
1083 * | 0 0 0 0 0 0 0 1 |
1086 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1087 * of the missing data.
1089 * As is apparent from the example above, the only non-trivial rows in the
1090 * inverse matrix correspond to the data disks that we're trying to
1091 * reconstruct. Indeed, those are the only rows we need as the others would
1092 * only be useful for reconstructing data known or assumed to be valid. For
1093 * that reason, we only build the coefficients in the rows that correspond to
1099 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1105 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1108 * Fill in the missing rows of interest.
1110 for (i = 0; i < nmap; i++) {
1111 ASSERT3S(0, <=, map[i]);
1112 ASSERT3S(map[i], <=, 2);
1119 for (j = 0; j < n; j++) {
1123 rows[i][j] = vdev_raidz_pow2[pow];
1129 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1130 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1136 * Assert that the first nmissing entries from the array of used
1137 * columns correspond to parity columns and that subsequent entries
1138 * correspond to data columns.
1140 for (i = 0; i < nmissing; i++) {
1141 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1143 for (; i < n; i++) {
1144 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1148 * First initialize the storage where we'll compute the inverse rows.
1150 for (i = 0; i < nmissing; i++) {
1151 for (j = 0; j < n; j++) {
1152 invrows[i][j] = (i == j) ? 1 : 0;
1157 * Subtract all trivial rows from the rows of consequence.
1159 for (i = 0; i < nmissing; i++) {
1160 for (j = nmissing; j < n; j++) {
1161 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1162 jj = used[j] - rm->rm_firstdatacol;
1164 invrows[i][j] = rows[i][jj];
1170 * For each of the rows of interest, we must normalize it and subtract
1171 * a multiple of it from the other rows.
1173 for (i = 0; i < nmissing; i++) {
1174 for (j = 0; j < missing[i]; j++) {
1175 ASSERT0(rows[i][j]);
1177 ASSERT3U(rows[i][missing[i]], !=, 0);
1180 * Compute the inverse of the first element and multiply each
1181 * element in the row by that value.
1183 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1185 for (j = 0; j < n; j++) {
1186 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1187 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1190 for (ii = 0; ii < nmissing; ii++) {
1194 ASSERT3U(rows[ii][missing[i]], !=, 0);
1196 log = vdev_raidz_log2[rows[ii][missing[i]]];
1198 for (j = 0; j < n; j++) {
1200 vdev_raidz_exp2(rows[i][j], log);
1202 vdev_raidz_exp2(invrows[i][j], log);
1208 * Verify that the data that is left in the rows are properly part of
1209 * an identity matrix.
1211 for (i = 0; i < nmissing; i++) {
1212 for (j = 0; j < n; j++) {
1213 if (j == missing[i]) {
1214 ASSERT3U(rows[i][j], ==, 1);
1216 ASSERT0(rows[i][j]);
1223 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1224 int *missing, uint8_t **invrows, const uint8_t *used)
1229 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1230 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1234 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1238 psize = sizeof (invlog[0][0]) * n * nmissing;
1239 p = kmem_alloc(psize, KM_SLEEP);
1241 for (pp = p, i = 0; i < nmissing; i++) {
1246 for (i = 0; i < nmissing; i++) {
1247 for (j = 0; j < n; j++) {
1248 ASSERT3U(invrows[i][j], !=, 0);
1249 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1253 for (i = 0; i < n; i++) {
1255 ASSERT3U(c, <, rm->rm_cols);
1257 src = rm->rm_col[c].rc_data;
1258 ccount = rm->rm_col[c].rc_size;
1259 for (j = 0; j < nmissing; j++) {
1260 cc = missing[j] + rm->rm_firstdatacol;
1261 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1262 ASSERT3U(cc, <, rm->rm_cols);
1263 ASSERT3U(cc, !=, c);
1265 dst[j] = rm->rm_col[cc].rc_data;
1266 dcount[j] = rm->rm_col[cc].rc_size;
1269 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1271 for (x = 0; x < ccount; x++, src++) {
1273 log = vdev_raidz_log2[*src];
1275 for (cc = 0; cc < nmissing; cc++) {
1276 if (x >= dcount[cc])
1282 if ((ll = log + invlog[cc][i]) >= 255)
1284 val = vdev_raidz_pow2[ll];
1295 kmem_free(p, psize);
1299 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1303 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1304 int parity_map[VDEV_RAIDZ_MAXPARITY];
1309 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1310 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1316 n = rm->rm_cols - rm->rm_firstdatacol;
1319 * Figure out which data columns are missing.
1322 for (t = 0; t < ntgts; t++) {
1323 if (tgts[t] >= rm->rm_firstdatacol) {
1324 missing_rows[nmissing_rows++] =
1325 tgts[t] - rm->rm_firstdatacol;
1330 * Figure out which parity columns to use to help generate the missing
1333 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1335 ASSERT(c < rm->rm_firstdatacol);
1338 * Skip any targeted parity columns.
1340 if (c == tgts[tt]) {
1352 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1354 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1355 nmissing_rows * n + sizeof (used[0]) * n;
1356 p = kmem_alloc(psize, KM_SLEEP);
1358 for (pp = p, i = 0; i < nmissing_rows; i++) {
1366 for (i = 0; i < nmissing_rows; i++) {
1367 used[i] = parity_map[i];
1370 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1371 if (tt < nmissing_rows &&
1372 c == missing_rows[tt] + rm->rm_firstdatacol) {
1383 * Initialize the interesting rows of the matrix.
1385 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1388 * Invert the matrix.
1390 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1394 * Reconstruct the missing data using the generated matrix.
1396 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1399 kmem_free(p, psize);
1405 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1407 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1411 int nbadparity, nbaddata;
1412 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1415 * The tgts list must already be sorted.
1417 for (i = 1; i < nt; i++) {
1418 ASSERT(t[i] > t[i - 1]);
1421 nbadparity = rm->rm_firstdatacol;
1422 nbaddata = rm->rm_cols - nbadparity;
1424 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1425 if (c < rm->rm_firstdatacol)
1426 parity_valid[c] = B_FALSE;
1428 if (i < nt && c == t[i]) {
1431 } else if (rm->rm_col[c].rc_error != 0) {
1433 } else if (c >= rm->rm_firstdatacol) {
1436 parity_valid[c] = B_TRUE;
1441 ASSERT(ntgts >= nt);
1442 ASSERT(nbaddata >= 0);
1443 ASSERT(nbaddata + nbadparity == ntgts);
1445 dt = &tgts[nbadparity];
1448 * See if we can use any of our optimized reconstruction routines.
1450 if (!vdev_raidz_default_to_general) {
1453 if (parity_valid[VDEV_RAIDZ_P])
1454 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1456 ASSERT(rm->rm_firstdatacol > 1);
1458 if (parity_valid[VDEV_RAIDZ_Q])
1459 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1461 ASSERT(rm->rm_firstdatacol > 2);
1465 ASSERT(rm->rm_firstdatacol > 1);
1467 if (parity_valid[VDEV_RAIDZ_P] &&
1468 parity_valid[VDEV_RAIDZ_Q])
1469 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1471 ASSERT(rm->rm_firstdatacol > 2);
1477 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1478 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1484 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1485 uint64_t *logical_ashift, uint64_t *physical_ashift)
1488 uint64_t nparity = vd->vdev_nparity;
1493 ASSERT(nparity > 0);
1495 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1496 vd->vdev_children < nparity + 1) {
1497 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1498 return (SET_ERROR(EINVAL));
1501 vdev_open_children(vd);
1503 for (c = 0; c < vd->vdev_children; c++) {
1504 cvd = vd->vdev_child[c];
1506 if (cvd->vdev_open_error != 0) {
1507 lasterror = cvd->vdev_open_error;
1512 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1513 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1514 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1515 *physical_ashift = MAX(*physical_ashift,
1516 cvd->vdev_physical_ashift);
1519 *asize *= vd->vdev_children;
1520 *max_asize *= vd->vdev_children;
1522 if (numerrors > nparity) {
1523 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1531 vdev_raidz_close(vdev_t *vd)
1535 for (c = 0; c < vd->vdev_children; c++)
1536 vdev_close(vd->vdev_child[c]);
1541 * Handle a read or write I/O to a RAID-Z dump device.
1543 * The dump device is in a unique situation compared to other ZFS datasets:
1544 * writing to this device should be as simple and fast as possible. In
1545 * addition, durability matters much less since the dump will be extracted
1546 * once the machine reboots. For that reason, this function eschews parity for
1547 * performance and simplicity. The dump device uses the checksum setting
1548 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1551 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
1552 * 128 KB will not fill an entire block; in addition, they may not be properly
1553 * aligned. In that case, this function uses the preallocated 128 KB block and
1554 * omits reading or writing any "empty" portions of that block, as opposed to
1555 * allocating a fresh appropriately-sized block.
1557 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1559 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1561 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1562 * allocated which spans all five child vdevs. 8 KB of data would be written to
1563 * each of four vdevs, with the fifth containing the parity bits.
1565 * parity data data data data
1566 * | PP | XX | XX | XX | XX |
1569 * 8 KB parity ------8 KB data blocks------
1571 * However, when writing to the dump device, the behavior is different:
1573 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1575 * Unlike the normal RAID-Z case in which the block is allocated based on the
1576 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
1577 * I/O size is less than 128 KB, only the actual portions of data are written.
1578 * In this example the data is written to the third data vdev since that vdev
1579 * contains the offset [64 KB, 96 KB).
1581 * parity data data data data
1587 * As a result, an individual I/O may not span all child vdevs; moreover, a
1588 * small I/O may only operate on a single child vdev.
1590 * Note that since there are no parity bits calculated or written, this format
1591 * remains the same no matter how many parity bits are used in a normal RAID-Z
1592 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
1595 * parity parity parity data data data data
1596 * | | | | | | XX | |
1602 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1603 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1605 vdev_t *tvd = vd->vdev_top;
1611 uint64_t start, end, colstart, colend;
1612 uint64_t coloffset, colsize, colskip;
1614 int flags = doread ? BIO_READ : BIO_WRITE;
1619 * Don't write past the end of the block
1621 VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);
1627 * Allocate a RAID-Z map for this block. Note that this block starts
1628 * from the "original" offset, that is, the offset of the extent which
1629 * contains the requisite offset of the data being read or written.
1631 * Even if this I/O operation doesn't span the full block size, let's
1632 * treat the on-disk format as if the only blocks are the complete 128
1635 rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1636 SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, vd->vdev_children,
1639 coloffset = origoffset;
1641 for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1642 c++, coloffset += rc->rc_size) {
1643 rc = &rm->rm_col[c];
1644 cvd = vd->vdev_child[rc->rc_devidx];
1647 * Find the start and end of this column in the RAID-Z map,
1648 * keeping in mind that the stated size and offset of the
1649 * operation may not fill the entire column for this vdev.
1651 * If any portion of the data spans this column, issue the
1652 * appropriate operation to the vdev.
1654 if (coloffset + rc->rc_size <= start)
1656 if (coloffset >= end)
1659 colstart = MAX(coloffset, start);
1660 colend = MIN(end, coloffset + rc->rc_size);
1661 colsize = colend - colstart;
1662 colskip = colstart - coloffset;
1664 VERIFY3U(colsize, <=, rc->rc_size);
1665 VERIFY3U(colskip, <=, rc->rc_size);
1668 * Note that the child vdev will have a vdev label at the start
1669 * of its range of offsets, hence the need for
1670 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1671 * example of why this calculation is needed.
1673 if ((err = vdev_disk_physio(cvd,
1674 ((char *)rc->rc_data) + colskip, colsize,
1675 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1676 flags, isdump)) != 0)
1680 vdev_raidz_map_free(rm);
1688 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1691 uint64_t ashift = vd->vdev_top->vdev_ashift;
1692 uint64_t cols = vd->vdev_children;
1693 uint64_t nparity = vd->vdev_nparity;
1695 asize = ((psize - 1) >> ashift) + 1;
1696 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1697 asize = roundup(asize, nparity + 1) << ashift;
1703 vdev_raidz_child_done(zio_t *zio)
1705 raidz_col_t *rc = zio->io_private;
1707 rc->rc_error = zio->io_error;
1713 * Start an IO operation on a RAIDZ VDev
1716 * - For write operations:
1717 * 1. Generate the parity data
1718 * 2. Create child zio write operations to each column's vdev, for both
1720 * 3. If the column skips any sectors for padding, create optional dummy
1721 * write zio children for those areas to improve aggregation continuity.
1722 * - For read operations:
1723 * 1. Create child zio read operations to each data column's vdev to read
1724 * the range of data required for zio.
1725 * 2. If this is a scrub or resilver operation, or if any of the data
1726 * vdevs have had errors, then create zio read operations to the parity
1727 * columns' VDevs as well.
1730 vdev_raidz_io_start(zio_t *zio)
1732 vdev_t *vd = zio->io_vd;
1733 vdev_t *tvd = vd->vdev_top;
1739 rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1740 zio->io_type == ZIO_TYPE_FREE,
1741 tvd->vdev_ashift, vd->vdev_children,
1745 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1747 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1749 if (zio->io_type == ZIO_TYPE_FREE) {
1750 for (c = 0; c < rm->rm_cols; c++) {
1751 rc = &rm->rm_col[c];
1752 cvd = vd->vdev_child[rc->rc_devidx];
1753 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1754 rc->rc_offset, rc->rc_data, rc->rc_size,
1755 zio->io_type, zio->io_priority, 0,
1756 vdev_raidz_child_done, rc));
1758 return (ZIO_PIPELINE_CONTINUE);
1761 if (zio->io_type == ZIO_TYPE_WRITE) {
1762 vdev_raidz_generate_parity(rm);
1764 for (c = 0; c < rm->rm_cols; c++) {
1765 rc = &rm->rm_col[c];
1766 cvd = vd->vdev_child[rc->rc_devidx];
1767 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1768 rc->rc_offset, rc->rc_data, rc->rc_size,
1769 zio->io_type, zio->io_priority, 0,
1770 vdev_raidz_child_done, rc));
1774 * Generate optional I/Os for any skipped sectors to improve
1775 * aggregation contiguity.
1777 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1778 ASSERT(c <= rm->rm_scols);
1779 if (c == rm->rm_scols)
1781 rc = &rm->rm_col[c];
1782 cvd = vd->vdev_child[rc->rc_devidx];
1783 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1784 rc->rc_offset + rc->rc_size, NULL,
1785 1 << tvd->vdev_ashift,
1786 zio->io_type, zio->io_priority,
1787 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1790 return (ZIO_PIPELINE_CONTINUE);
1793 ASSERT(zio->io_type == ZIO_TYPE_READ);
1796 * Iterate over the columns in reverse order so that we hit the parity
1797 * last -- any errors along the way will force us to read the parity.
1799 for (c = rm->rm_cols - 1; c >= 0; c--) {
1800 rc = &rm->rm_col[c];
1801 cvd = vd->vdev_child[rc->rc_devidx];
1802 if (!vdev_readable(cvd)) {
1803 if (c >= rm->rm_firstdatacol)
1804 rm->rm_missingdata++;
1806 rm->rm_missingparity++;
1807 rc->rc_error = SET_ERROR(ENXIO);
1808 rc->rc_tried = 1; /* don't even try */
1812 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1813 if (c >= rm->rm_firstdatacol)
1814 rm->rm_missingdata++;
1816 rm->rm_missingparity++;
1817 rc->rc_error = SET_ERROR(ESTALE);
1821 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1822 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1823 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1824 rc->rc_offset, rc->rc_data, rc->rc_size,
1825 zio->io_type, zio->io_priority, 0,
1826 vdev_raidz_child_done, rc));
1830 return (ZIO_PIPELINE_CONTINUE);
1835 * Report a checksum error for a child of a RAID-Z device.
1838 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1840 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1842 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1843 zio_bad_cksum_t zbc;
1844 raidz_map_t *rm = zio->io_vsd;
1846 mutex_enter(&vd->vdev_stat_lock);
1847 vd->vdev_stat.vs_checksum_errors++;
1848 mutex_exit(&vd->vdev_stat_lock);
1850 zbc.zbc_has_cksum = 0;
1851 zbc.zbc_injected = rm->rm_ecksuminjected;
1853 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1854 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1860 * We keep track of whether or not there were any injected errors, so that
1861 * any ereports we generate can note it.
1864 raidz_checksum_verify(zio_t *zio)
1866 zio_bad_cksum_t zbc;
1867 raidz_map_t *rm = zio->io_vsd;
1869 int ret = zio_checksum_error(zio, &zbc);
1870 if (ret != 0 && zbc.zbc_injected != 0)
1871 rm->rm_ecksuminjected = 1;
1877 * Generate the parity from the data columns. If we tried and were able to
1878 * read the parity without error, verify that the generated parity matches the
1879 * data we read. If it doesn't, we fire off a checksum error. Return the
1880 * number of such failures.
1883 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1885 void *orig[VDEV_RAIDZ_MAXPARITY];
1889 blkptr_t *bp = zio->io_bp;
1890 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1891 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1893 if (checksum == ZIO_CHECKSUM_NOPARITY)
1896 for (c = 0; c < rm->rm_firstdatacol; c++) {
1897 rc = &rm->rm_col[c];
1898 if (!rc->rc_tried || rc->rc_error != 0)
1900 orig[c] = zio_buf_alloc(rc->rc_size);
1901 bcopy(rc->rc_data, orig[c], rc->rc_size);
1904 vdev_raidz_generate_parity(rm);
1906 for (c = 0; c < rm->rm_firstdatacol; c++) {
1907 rc = &rm->rm_col[c];
1908 if (!rc->rc_tried || rc->rc_error != 0)
1910 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1911 raidz_checksum_error(zio, rc, orig[c]);
1912 rc->rc_error = SET_ERROR(ECKSUM);
1915 zio_buf_free(orig[c], rc->rc_size);
1922 * Keep statistics on all the ways that we used parity to correct data.
1924 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1927 vdev_raidz_worst_error(raidz_map_t *rm)
1931 for (int c = 0; c < rm->rm_cols; c++)
1932 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1938 * Iterate over all combinations of bad data and attempt a reconstruction.
1939 * Note that the algorithm below is non-optimal because it doesn't take into
1940 * account how reconstruction is actually performed. For example, with
1941 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1942 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1943 * cases we'd only use parity information in column 0.
1946 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1948 raidz_map_t *rm = zio->io_vsd;
1950 void *orig[VDEV_RAIDZ_MAXPARITY];
1951 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1952 int *tgts = &tstore[1];
1953 int current, next, i, c, n;
1956 ASSERT(total_errors < rm->rm_firstdatacol);
1959 * This simplifies one edge condition.
1963 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1965 * Initialize the targets array by finding the first n columns
1966 * that contain no error.
1968 * If there were no data errors, we need to ensure that we're
1969 * always explicitly attempting to reconstruct at least one
1970 * data column. To do this, we simply push the highest target
1971 * up into the data columns.
1973 for (c = 0, i = 0; i < n; i++) {
1974 if (i == n - 1 && data_errors == 0 &&
1975 c < rm->rm_firstdatacol) {
1976 c = rm->rm_firstdatacol;
1979 while (rm->rm_col[c].rc_error != 0) {
1981 ASSERT3S(c, <, rm->rm_cols);
1988 * Setting tgts[n] simplifies the other edge condition.
1990 tgts[n] = rm->rm_cols;
1993 * These buffers were allocated in previous iterations.
1995 for (i = 0; i < n - 1; i++) {
1996 ASSERT(orig[i] != NULL);
1999 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2002 next = tgts[current];
2004 while (current != n) {
2005 tgts[current] = next;
2009 * Save off the original data that we're going to
2010 * attempt to reconstruct.
2012 for (i = 0; i < n; i++) {
2013 ASSERT(orig[i] != NULL);
2016 ASSERT3S(c, <, rm->rm_cols);
2017 rc = &rm->rm_col[c];
2018 bcopy(rc->rc_data, orig[i], rc->rc_size);
2022 * Attempt a reconstruction and exit the outer loop on
2025 code = vdev_raidz_reconstruct(rm, tgts, n);
2026 if (raidz_checksum_verify(zio) == 0) {
2027 atomic_inc_64(&raidz_corrected[code]);
2029 for (i = 0; i < n; i++) {
2031 rc = &rm->rm_col[c];
2032 ASSERT(rc->rc_error == 0);
2034 raidz_checksum_error(zio, rc,
2036 rc->rc_error = SET_ERROR(ECKSUM);
2044 * Restore the original data.
2046 for (i = 0; i < n; i++) {
2048 rc = &rm->rm_col[c];
2049 bcopy(orig[i], rc->rc_data, rc->rc_size);
2054 * Find the next valid column after the current
2057 for (next = tgts[current] + 1;
2058 next < rm->rm_cols &&
2059 rm->rm_col[next].rc_error != 0; next++)
2062 ASSERT(next <= tgts[current + 1]);
2065 * If that spot is available, we're done here.
2067 if (next != tgts[current + 1])
2071 * Otherwise, find the next valid column after
2072 * the previous position.
2074 for (c = tgts[current - 1] + 1;
2075 rm->rm_col[c].rc_error != 0; c++)
2081 } while (current != n);
2086 for (i = 0; i < n; i++) {
2087 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2094 * Complete an IO operation on a RAIDZ VDev
2097 * - For write operations:
2098 * 1. Check for errors on the child IOs.
2099 * 2. Return, setting an error code if too few child VDevs were written
2100 * to reconstruct the data later. Note that partial writes are
2101 * considered successful if they can be reconstructed at all.
2102 * - For read operations:
2103 * 1. Check for errors on the child IOs.
2104 * 2. If data errors occurred:
2105 * a. Try to reassemble the data from the parity available.
2106 * b. If we haven't yet read the parity drives, read them now.
2107 * c. If all parity drives have been read but the data still doesn't
2108 * reassemble with a correct checksum, then try combinatorial
2110 * d. If that doesn't work, return an error.
2111 * 3. If there were unexpected errors or this is a resilver operation,
2112 * rewrite the vdevs that had errors.
2115 vdev_raidz_io_done(zio_t *zio)
2117 vdev_t *vd = zio->io_vd;
2119 raidz_map_t *rm = zio->io_vsd;
2121 int unexpected_errors = 0;
2122 int parity_errors = 0;
2123 int parity_untried = 0;
2124 int data_errors = 0;
2125 int total_errors = 0;
2127 int tgts[VDEV_RAIDZ_MAXPARITY];
2130 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
2132 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2133 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2135 for (c = 0; c < rm->rm_cols; c++) {
2136 rc = &rm->rm_col[c];
2139 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
2141 if (c < rm->rm_firstdatacol)
2146 if (!rc->rc_skipped)
2147 unexpected_errors++;
2150 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2155 if (zio->io_type == ZIO_TYPE_WRITE) {
2157 * XXX -- for now, treat partial writes as a success.
2158 * (If we couldn't write enough columns to reconstruct
2159 * the data, the I/O failed. Otherwise, good enough.)
2161 * Now that we support write reallocation, it would be better
2162 * to treat partial failure as real failure unless there are
2163 * no non-degraded top-level vdevs left, and not update DTLs
2164 * if we intend to reallocate.
2167 if (total_errors > rm->rm_firstdatacol)
2168 zio->io_error = vdev_raidz_worst_error(rm);
2171 } else if (zio->io_type == ZIO_TYPE_FREE) {
2175 ASSERT(zio->io_type == ZIO_TYPE_READ);
2177 * There are three potential phases for a read:
2178 * 1. produce valid data from the columns read
2179 * 2. read all disks and try again
2180 * 3. perform combinatorial reconstruction
2182 * Each phase is progressively both more expensive and less likely to
2183 * occur. If we encounter more errors than we can repair or all phases
2184 * fail, we have no choice but to return an error.
2188 * If the number of errors we saw was correctable -- less than or equal
2189 * to the number of parity disks read -- attempt to produce data that
2190 * has a valid checksum. Naturally, this case applies in the absence of
2193 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2194 if (data_errors == 0) {
2195 if (raidz_checksum_verify(zio) == 0) {
2197 * If we read parity information (unnecessarily
2198 * as it happens since no reconstruction was
2199 * needed) regenerate and verify the parity.
2200 * We also regenerate parity when resilvering
2201 * so we can write it out to the failed device
2204 if (parity_errors + parity_untried <
2205 rm->rm_firstdatacol ||
2206 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2207 n = raidz_parity_verify(zio, rm);
2208 unexpected_errors += n;
2209 ASSERT(parity_errors + n <=
2210 rm->rm_firstdatacol);
2216 * We either attempt to read all the parity columns or
2217 * none of them. If we didn't try to read parity, we
2218 * wouldn't be here in the correctable case. There must
2219 * also have been fewer parity errors than parity
2220 * columns or, again, we wouldn't be in this code path.
2222 ASSERT(parity_untried == 0);
2223 ASSERT(parity_errors < rm->rm_firstdatacol);
2226 * Identify the data columns that reported an error.
2229 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2230 rc = &rm->rm_col[c];
2231 if (rc->rc_error != 0) {
2232 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2237 ASSERT(rm->rm_firstdatacol >= n);
2239 code = vdev_raidz_reconstruct(rm, tgts, n);
2241 if (raidz_checksum_verify(zio) == 0) {
2242 atomic_inc_64(&raidz_corrected[code]);
2245 * If we read more parity disks than were used
2246 * for reconstruction, confirm that the other
2247 * parity disks produced correct data. This
2248 * routine is suboptimal in that it regenerates
2249 * the parity that we already used in addition
2250 * to the parity that we're attempting to
2251 * verify, but this should be a relatively
2252 * uncommon case, and can be optimized if it
2253 * becomes a problem. Note that we regenerate
2254 * parity when resilvering so we can write it
2255 * out to failed devices later.
2257 if (parity_errors < rm->rm_firstdatacol - n ||
2258 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2259 n = raidz_parity_verify(zio, rm);
2260 unexpected_errors += n;
2261 ASSERT(parity_errors + n <=
2262 rm->rm_firstdatacol);
2271 * This isn't a typical situation -- either we got a read error or
2272 * a child silently returned bad data. Read every block so we can
2273 * try again with as much data and parity as we can track down. If
2274 * we've already been through once before, all children will be marked
2275 * as tried so we'll proceed to combinatorial reconstruction.
2277 unexpected_errors = 1;
2278 rm->rm_missingdata = 0;
2279 rm->rm_missingparity = 0;
2281 for (c = 0; c < rm->rm_cols; c++) {
2282 if (rm->rm_col[c].rc_tried)
2285 zio_vdev_io_redone(zio);
2287 rc = &rm->rm_col[c];
2290 zio_nowait(zio_vdev_child_io(zio, NULL,
2291 vd->vdev_child[rc->rc_devidx],
2292 rc->rc_offset, rc->rc_data, rc->rc_size,
2293 zio->io_type, zio->io_priority, 0,
2294 vdev_raidz_child_done, rc));
2295 } while (++c < rm->rm_cols);
2301 * At this point we've attempted to reconstruct the data given the
2302 * errors we detected, and we've attempted to read all columns. There
2303 * must, therefore, be one or more additional problems -- silent errors
2304 * resulting in invalid data rather than explicit I/O errors resulting
2305 * in absent data. We check if there is enough additional data to
2306 * possibly reconstruct the data and then perform combinatorial
2307 * reconstruction over all possible combinations. If that fails,
2310 if (total_errors > rm->rm_firstdatacol) {
2311 zio->io_error = vdev_raidz_worst_error(rm);
2313 } else if (total_errors < rm->rm_firstdatacol &&
2314 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2316 * If we didn't use all the available parity for the
2317 * combinatorial reconstruction, verify that the remaining
2318 * parity is correct.
2320 if (code != (1 << rm->rm_firstdatacol) - 1)
2321 (void) raidz_parity_verify(zio, rm);
2324 * We're here because either:
2326 * total_errors == rm_firstdatacol, or
2327 * vdev_raidz_combrec() failed
2329 * In either case, there is enough bad data to prevent
2332 * Start checksum ereports for all children which haven't
2333 * failed, and the IO wasn't speculative.
2335 zio->io_error = SET_ERROR(ECKSUM);
2337 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2338 for (c = 0; c < rm->rm_cols; c++) {
2339 rc = &rm->rm_col[c];
2340 if (rc->rc_error == 0) {
2341 zio_bad_cksum_t zbc;
2342 zbc.zbc_has_cksum = 0;
2344 rm->rm_ecksuminjected;
2346 zfs_ereport_start_checksum(
2348 vd->vdev_child[rc->rc_devidx],
2349 zio, rc->rc_offset, rc->rc_size,
2350 (void *)(uintptr_t)c, &zbc);
2357 zio_checksum_verified(zio);
2359 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2360 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2362 * Use the good data we have in hand to repair damaged children.
2364 for (c = 0; c < rm->rm_cols; c++) {
2365 rc = &rm->rm_col[c];
2366 cvd = vd->vdev_child[rc->rc_devidx];
2368 if (rc->rc_error == 0)
2371 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2372 rc->rc_offset, rc->rc_data, rc->rc_size,
2373 ZIO_TYPE_WRITE, zio->io_priority,
2374 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2375 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2381 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2383 if (faulted > vd->vdev_nparity)
2384 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2385 VDEV_AUX_NO_REPLICAS);
2386 else if (degraded + faulted != 0)
2387 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2389 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2392 vdev_ops_t vdev_raidz_ops = {
2396 vdev_raidz_io_start,
2398 vdev_raidz_state_change,
2401 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2402 B_FALSE /* not a leaf vdev */