4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
29 #include <sys/zfs_context.h>
31 #include <sys/vdev_impl.h>
33 #include <sys/vdev_disk.h>
35 #include <sys/vdev_file.h>
36 #include <sys/vdev_raidz.h>
38 #include <sys/zio_checksum.h>
40 #include <sys/fs/zfs.h>
41 #include <sys/fm/fs/zfs.h>
45 * Virtual device vector for RAID-Z.
47 * This vdev supports single, double, and triple parity. For single parity,
48 * we use a simple XOR of all the data columns. For double or triple parity,
49 * we use a special case of Reed-Solomon coding. This extends the
50 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
51 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
52 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
53 * former is also based. The latter is designed to provide higher performance
56 * Note that the Plank paper claimed to support arbitrary N+M, but was then
57 * amended six years later identifying a critical flaw that invalidates its
58 * claims. Nevertheless, the technique can be adapted to work for up to
59 * triple parity. For additional parity, the amendment "Note: Correction to
60 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
61 * is viable, but the additional complexity means that write performance will
64 * All of the methods above operate on a Galois field, defined over the
65 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
66 * can be expressed with a single byte. Briefly, the operations on the
67 * field are defined as follows:
69 * o addition (+) is represented by a bitwise XOR
70 * o subtraction (-) is therefore identical to addition: A + B = A - B
71 * o multiplication of A by 2 is defined by the following bitwise expression:
76 * (A * 2)_4 = A_3 + A_7
77 * (A * 2)_3 = A_2 + A_7
78 * (A * 2)_2 = A_1 + A_7
82 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
83 * As an aside, this multiplication is derived from the error correcting
84 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
86 * Observe that any number in the field (except for 0) can be expressed as a
87 * power of 2 -- a generator for the field. We store a table of the powers of
88 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
89 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
90 * than field addition). The inverse of a field element A (A^-1) is therefore
91 * A ^ (255 - 1) = A^254.
93 * The up-to-three parity columns, P, Q, R over several data columns,
94 * D_0, ... D_n-1, can be expressed by field operations:
96 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
97 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
98 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
99 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
100 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
102 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
103 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
104 * independent coefficients. (There are no additional coefficients that have
105 * this property which is why the uncorrected Plank method breaks down.)
107 * See the reconstruction code below for how P, Q and R can be used individually
108 * or in concert to recover missing data columns.
/*
 * Per-column state for a single RAID-Z I/O.  Each column maps to one
 * child vdev and carries its own buffer, error status, and the
 * tried/skipped bookkeeping used by combinatorial reconstruction.
 * NOTE(review): this chunk is missing lines -- the closing
 * "} raidz_col_t;" is not visible here.
 */
111 typedef struct raidz_col {
112 uint64_t rc_devidx; /* child device index for I/O */
113 uint64_t rc_offset; /* device offset */
114 uint64_t rc_size; /* I/O size */
115 abd_t *rc_abd; /* I/O data */
116 void *rc_gdata; /* used to store the "good" version */
117 int rc_error; /* I/O error for this device */
118 uint8_t rc_tried; /* Did we attempt this I/O column? */
119 uint8_t rc_skipped; /* Did we skip this I/O column? */
/*
 * Geometry and state for one entire RAID-Z I/O: the array of columns
 * (parity columns first, then data columns), size/padding bookkeeping,
 * and the copy of read data retained for checksum ereports.
 * rm_col[1] is the pre-C99 trailing-array idiom; the map is allocated
 * with offsetof(raidz_map_t, rm_col[rm_scols]) (see vdev_raidz_map_alloc
 * and vdev_raidz_map_free below).
 * NOTE(review): missing lines -- the closing "} raidz_map_t;" is not
 * visible in this chunk.
 */
122 typedef struct raidz_map {
123 uint64_t rm_cols; /* Regular column count */
124 uint64_t rm_scols; /* Count including skipped columns */
125 uint64_t rm_bigcols; /* Number of oversized columns */
126 uint64_t rm_asize; /* Actual total I/O size */
127 uint64_t rm_missingdata; /* Count of missing data devices */
128 uint64_t rm_missingparity; /* Count of missing parity devices */
129 uint64_t rm_firstdatacol; /* First data column/parity count */
130 uint64_t rm_nskip; /* Skipped sectors for padding */
131 uint64_t rm_skipstart; /* Column index of padding start */
132 abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
133 uintptr_t rm_reports; /* # of referencing checksum reports */
134 uint8_t rm_freed; /* map no longer has referencing ZIO */
135 uint8_t rm_ecksuminjected; /* checksum error was injected */
136 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
/* Parity column indices within rm_col[] -- parity columns come first. */
139 #define VDEV_RAIDZ_P 0
140 #define VDEV_RAIDZ_Q 1
141 #define VDEV_RAIDZ_R 2
/*
 * Single-byte GF(2^8) multiply-by-2 (and by-4): shift left and, when the
 * high bit was set, reduce by XORing the primitive polynomial's low byte
 * (0x1d, from x^8 + x^4 + x^3 + x^2 + 1 -- see the block comment above).
 */
143 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
144 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
147 * We provide a mechanism to perform the field multiplication operation on a
148 * 64-bit value all at once rather than a byte at a time. This works by
149 * creating a mask from the top bit in each byte and using that to
150 * conditionally apply the XOR of 0x1d.
/*
 * NOTE(review): the "do {" / "} while (0)" wrapper lines of these two
 * multi-statement macros are missing from this chunk; they are required
 * for the macros to be statement-safe.
 */
152 #define VDEV_RAIDZ_64MUL_2(x, mask) \
154 (mask) = (x) & 0x8080808080808080ULL; \
/* ((mask) << 1) - ((mask) >> 7) turns each 0x80 byte into 0xff, 0x00 stays 0x00 */
155 (mask) = ((mask) << 1) - ((mask) >> 7); \
156 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
157 ((mask) & 0x1d1d1d1d1d1d1d1d); \
160 #define VDEV_RAIDZ_64MUL_4(x, mask) \
162 VDEV_RAIDZ_64MUL_2((x), mask); \
163 VDEV_RAIDZ_64MUL_2((x), mask); \
/* Translate an offset within the vdev label area to a device offset. */
166 #define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
169 * Force reconstruction to use the general purpose method.
/* Tunable: nonzero forces the matrix-based general reconstruction path. */
171 int vdev_raidz_default_to_general;
173 /* Powers of 2 in the Galois field defined above. */
/*
 * vdev_raidz_pow2[i] == 2^i in GF(2^8).  Since the multiplicative order
 * of 2 is 255, the table wraps: pow2[255] == pow2[0] == 0x01.
 * NOTE(review): the closing "};" line is not visible in this chunk.
 */
174 static const uint8_t vdev_raidz_pow2[256] = {
175 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
176 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
177 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
178 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
179 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
180 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
181 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
182 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
183 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
184 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
185 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
186 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
187 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
188 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
189 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
190 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
191 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
192 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
193 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
194 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
195 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
196 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
197 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
198 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
199 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
200 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
201 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
202 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
203 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
204 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
205 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
206 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
208 /* Logs of 2 in the Galois field defined above. */
/*
 * vdev_raidz_log2[v] == log base 2 of v in GF(2^8), i.e. the inverse
 * lookup of vdev_raidz_pow2[].  log2[0] is a placeholder (log of zero
 * is undefined); callers must not rely on it -- see the ASSERT in
 * vdev_raidz_exp2().
 * NOTE(review): the closing "};" line is not visible in this chunk.
 */
209 static const uint8_t vdev_raidz_log2[256] = {
210 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
211 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
212 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
213 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
214 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
215 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
216 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
217 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
218 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
219 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
220 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
221 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
222 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
223 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
224 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
225 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
226 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
227 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
228 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
229 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
230 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
231 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
232 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
233 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
234 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
235 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
236 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
237 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
238 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
239 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
240 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
241 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
/* Forward declaration: vdev_raidz_cksum_finish() below needs it. */
244 static void vdev_raidz_generate_parity(raidz_map_t *rm);
247 * Multiply a given number by 2 raised to the given power.
/*
 * Computes a * 2^exp in GF(2^8) via the log/pow tables:
 * a * 2^exp == pow2[log2[a] + exp].
 * NOTE(review): lines are missing from this chunk -- the return type,
 * braces, the a == 0 early return, and the exp > 255 wrap-around
 * (reducing mod the field's multiplicative order, 255) are not visible.
 */
250 vdev_raidz_exp2(uint_t a, int exp)
/* log2[a] is only meaningful for nonzero a; log2[1] == 0 is the one valid 0. */
256 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
258 exp += vdev_raidz_log2[a];
262 return (vdev_raidz_pow2[exp]);
/*
 * Release all resources held by a raidz_map_t.  Parity columns own
 * their buffers (abd_free + the "good data" shadow buffers); data
 * columns only borrowed views of the zio's buffer (abd_put).  Finally
 * free the ereport data copy and the map itself.
 * NOTE(review): the function header/braces and some declarations are
 * missing from this chunk.
 */
266 vdev_raidz_map_free(raidz_map_t *rm)
/* Parity columns: allocated in vdev_raidz_map_alloc(), so really freed. */
271 for (c = 0; c < rm->rm_firstdatacol; c++) {
272 if (rm->rm_col[c].rc_abd != NULL)
273 abd_free(rm->rm_col[c].rc_abd);
275 if (rm->rm_col[c].rc_gdata != NULL)
276 zio_buf_free(rm->rm_col[c].rc_gdata,
277 rm->rm_col[c].rc_size);
/* Data columns: views into the caller's buffer, only drop the reference. */
281 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
282 if (rm->rm_col[c].rc_abd != NULL)
283 abd_put(rm->rm_col[c].rc_abd);
284 size += rm->rm_col[c].rc_size;
287 if (rm->rm_abd_copy != NULL)
288 abd_free(rm->rm_abd_copy);
/* Map was allocated with rm_scols trailing columns -- free the same size. */
290 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
/*
 * zio vsd destructor.  If checksum reports still reference the map,
 * freeing is deferred to vdev_raidz_cksum_free() (the visible body
 * presumably sets rm_freed before the rm_reports check -- NOTE(review):
 * that line is missing from this chunk).
 */
294 vdev_raidz_map_free_vsd(zio_t *zio)
296 raidz_map_t *rm = zio->io_vsd;
298 ASSERT0(rm->rm_freed);
301 if (rm->rm_reports == 0)
302 vdev_raidz_map_free(rm);
/*
 * Checksum-report destructor: drop one reference on the raidz_map_t;
 * the last reference frees the map iff the owning ZIO already released
 * it (rm_freed set by vdev_raidz_map_free_vsd()).
 */
307 vdev_raidz_cksum_free(void *arg, size_t ignored)
309 raidz_map_t *rm = arg;
311 ASSERT3U(rm->rm_reports, >, 0);
313 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
314 vdev_raidz_map_free(rm);
/*
 * Complete a deferred checksum ereport for column c (zcr_cbinfo):
 * given the known-good logical data, derive the "good" version of this
 * column (for parity columns by regenerating parity from good_data;
 * for data columns by indexing into good_data), then hand good/bad
 * pairs to zfs_ereport_finish_checksum().
 * NOTE(review): several lines (braces, declarations of x/offset/buf/bad,
 * early return after the NULL check) are missing from this chunk.
 */
318 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
320 raidz_map_t *rm = zcr->zcr_cbdata;
321 size_t c = zcr->zcr_cbinfo;
324 const char *good = NULL;
/* No good data available: finish the ereport without a comparison. */
327 if (good_data == NULL) {
328 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
/* Parity column: compare against parity regenerated from good_data. */
332 if (c < rm->rm_firstdatacol) {
334 * The first time through, calculate the parity blocks for
335 * the good data (this relies on the fact that the good
336 * data never changes for a given logical ZIO)
338 if (rm->rm_col[0].rc_gdata == NULL) {
339 abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
344 * Set up the rm_col[]s to generate the parity for
345 * good_data, first saving the parity bufs and
346 * replacing them with buffers to hold the result.
348 for (x = 0; x < rm->rm_firstdatacol; x++) {
349 bad_parity[x] = rm->rm_col[x].rc_abd;
350 rm->rm_col[x].rc_gdata =
351 zio_buf_alloc(rm->rm_col[x].rc_size);
352 rm->rm_col[x].rc_abd =
353 abd_get_from_buf(rm->rm_col[x].rc_gdata,
354 rm->rm_col[x].rc_size);
357 /* fill in the data columns from good_data */
358 buf = (char *)good_data;
359 for (; x < rm->rm_cols; x++) {
360 abd_put(rm->rm_col[x].rc_abd);
361 rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
362 rm->rm_col[x].rc_size);
363 buf += rm->rm_col[x].rc_size;
367 * Construct the parity from the good data.
369 vdev_raidz_generate_parity(rm);
371 /* restore everything back to its original state */
372 for (x = 0; x < rm->rm_firstdatacol; x++) {
373 abd_put(rm->rm_col[x].rc_abd);
374 rm->rm_col[x].rc_abd = bad_parity[x];
/* Re-point data columns at the copy saved by vdev_raidz_cksum_report(). */
378 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
379 abd_put(rm->rm_col[x].rc_abd);
380 rm->rm_col[x].rc_abd = abd_get_offset(
381 rm->rm_abd_copy, offset);
382 offset += rm->rm_col[x].rc_size;
386 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
387 good = rm->rm_col[c].rc_gdata;
389 /* adjust good_data to point at the start of our column */
392 for (x = rm->rm_firstdatacol; x < c; x++)
393 good += rm->rm_col[x].rc_size;
/* Borrow a linear view of the bad column, report, and return it. */
396 bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
397 /* we drop the ereport if it ends up that the data was good */
398 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
399 abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
403 * Invoked indirectly by zfs_ereport_start_checksum(), called
404 * below when our read operation fails completely. The main point
405 * is to keep a copy of everything we read from disk, so that at
406 * vdev_raidz_cksum_finish() time we can compare it with the good data.
/*
 * NOTE(review): lines are missing from this chunk -- the rm_reports
 * increment, the early return after the rm_abd_copy != NULL check, the
 * declarations of size/offset, and the rm_abd_copy assignment line are
 * not visible.
 */
409 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
411 size_t c = (size_t)(uintptr_t)arg;
414 raidz_map_t *rm = zio->io_vsd;
417 /* set up the report and bump the refcount */
418 zcr->zcr_cbdata = rm;
420 zcr->zcr_finish = vdev_raidz_cksum_finish;
421 zcr->zcr_free = vdev_raidz_cksum_free;
424 ASSERT3U(rm->rm_reports, >, 0);
/* Data already copied aside by an earlier report for this map. */
426 if (rm->rm_abd_copy != NULL)
430 * It's the first time we're called for this raidz_map_t, so we need
431 * to copy the data aside; there's no guarantee that our zio's buffer
432 * won't be re-used for something else.
434 * Our parity data is already in separate buffers, so there's no need
/* Total size of all data columns, used to size the copy buffer. */
439 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
440 size += rm->rm_col[c].rc_size;
443 abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
/* Copy each data column into the aside buffer and re-point the column. */
445 for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
446 raidz_col_t *col = &rm->rm_col[c];
447 abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
449 abd_copy(tmp, col->rc_abd, col->rc_size);
450 abd_put(col->rc_abd);
453 offset += col->rc_size;
455 ASSERT3U(offset, ==, size);
/* vsd (vdev-specific data) callbacks installed on RAID-Z read zios. */
458 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
459 vdev_raidz_map_free_vsd,
460 vdev_raidz_cksum_report
464 * Divides the IO evenly across all child vdevs; usually, dcols is
465 * the number of children in the target vdev.
/*
 * Builds the raidz_map_t describing how a logical I/O of `size` bytes at
 * `offset` maps onto the children: column sizes, device offsets, parity
 * buffers, and the historical parity/data swap for single-parity maps.
 * NOTE(review): lines are missing from this chunk (braces, some
 * declarations, the acols computation, per-column coff/col setup, the
 * dofree branch, and the trailing "return (rm);").  The `dofree`
 * parameter presumably skips buffer attachment for free zios -- its
 * visible use is absent here; confirm against the full source.
 */
468 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
469 uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
472 /* The starting RAIDZ (parent) vdev sector of the block. */
473 uint64_t b = offset >> unit_shift;
474 /* The zio's size in units of the vdev's minimum sector size. */
475 uint64_t s = size >> unit_shift;
476 /* The first column for this stripe. */
477 uint64_t f = b % dcols;
478 /* The starting byte offset on each child vdev. */
479 uint64_t o = (b / dcols) << unit_shift;
480 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
484 * "Quotient": The number of data sectors for this stripe on all but
485 * the "big column" child vdevs that also contain "remainder" data.
487 q = s / (dcols - nparity);
490 * "Remainder": The number of partial stripe data sectors in this I/O.
491 * This will add a sector to some, but not all, child vdevs.
493 r = s - q * (dcols - nparity);
495 /* The number of "big columns" - those which contain remainder data. */
496 bc = (r == 0 ? 0 : r + nparity);
499 * The total number of data and parity sectors associated with
502 tot = s + nparity * (q + (r == 0 ? 0 : 1));
504 /* acols: The columns that will be accessed. */
505 /* scols: The columns that will be accessed or skipped. */
507 /* Our I/O request doesn't span all child vdevs. */
509 scols = MIN(dcols, roundup(bc, nparity + 1));
515 ASSERT3U(acols, <=, scols);
/* Allocate the map with scols trailing raidz_col_t slots. */
517 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
520 rm->rm_scols = scols;
522 rm->rm_skipstart = bc;
523 rm->rm_missingdata = 0;
524 rm->rm_missingparity = 0;
525 rm->rm_firstdatacol = nparity;
526 rm->rm_abd_copy = NULL;
529 rm->rm_ecksuminjected = 0;
/* Initialize every column's geometry; big columns get one extra sector. */
533 for (c = 0; c < scols; c++) {
538 coff += 1ULL << unit_shift;
540 rm->rm_col[c].rc_devidx = col;
541 rm->rm_col[c].rc_offset = coff;
542 rm->rm_col[c].rc_abd = NULL;
543 rm->rm_col[c].rc_gdata = NULL;
544 rm->rm_col[c].rc_error = 0;
545 rm->rm_col[c].rc_tried = 0;
546 rm->rm_col[c].rc_skipped = 0;
549 rm->rm_col[c].rc_size = 0;
551 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
553 rm->rm_col[c].rc_size = q << unit_shift;
555 asize += rm->rm_col[c].rc_size;
558 ASSERT3U(asize, ==, tot << unit_shift);
/* Round the allocated size up to a full (nparity + 1)-sector multiple. */
559 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
560 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
561 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
562 ASSERT3U(rm->rm_nskip, <=, nparity);
/* Parity columns get freshly allocated buffers ... */
565 for (c = 0; c < rm->rm_firstdatacol; c++) {
566 rm->rm_col[c].rc_abd =
567 abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
/* ... data columns are views at increasing offsets into the zio's abd. */
570 rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
571 off = rm->rm_col[c].rc_size;
573 for (c = c + 1; c < acols; c++) {
574 rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
575 off += rm->rm_col[c].rc_size;
580 * If all data stored spans all columns, there's a danger that parity
581 * will always be on the same device and, since parity isn't read
582 * during normal operation, that that device's I/O bandwidth won't be
583 * used effectively. We therefore switch the parity every 1MB.
585 * ... at least that was, ostensibly, the theory. As a practical
586 * matter unless we juggle the parity between all devices evenly, we
587 * won't see any benefit. Further, occasional writes that aren't a
588 * multiple of the LCM of the number of children and the minimum
589 * stripe width are sufficient to avoid pessimal behavior.
590 * Unfortunately, this decision created an implicit on-disk format
591 * requirement that we need to support for all eternity, but only
592 * for single-parity RAID-Z.
594 * If we intend to skip a sector in the zeroth column for padding
595 * we must make sure to note this swap. We will never intend to
596 * skip the first column since at least one data and one parity
597 * column must appear in each row.
599 ASSERT(rm->rm_cols >= 2);
600 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
/* Single-parity only: swap columns 0 and 1 on odd 1MB regions. */
602 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
603 devidx = rm->rm_col[0].rc_devidx;
604 o = rm->rm_col[0].rc_offset;
605 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
606 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
607 rm->rm_col[1].rc_devidx = devidx;
608 rm->rm_col[1].rc_offset = o;
610 if (rm->rm_skipstart == 0)
611 rm->rm_skipstart = 1;
/*
 * abd iterator callback: XOR-accumulate this buffer into the P parity
 * stream (pqr->p).  NOTE(review): the struct pqr_struct definition and
 * this function's braces/loop body (presumably "*pqr->p ^= *src;" and a
 * return -- confirm against full source) are missing from this chunk.
 */
624 vdev_raidz_p_func(void *buf, size_t size, void *private)
626 struct pqr_struct *pqr = private;
627 const uint64_t *src = buf;
628 int i, cnt = size / sizeof (src[0]);
/* This callback is only valid for P-only parity generation. */
630 ASSERT(pqr->p && !pqr->q && !pqr->r);
632 for (i = 0; i < cnt; i++, src++, pqr->p++)
/*
 * abd iterator callback: accumulate this buffer into both P (XOR) and Q
 * (multiply the running Q by 2 in GF(2^8), then XOR) parity streams.
 * NOTE(review): braces, the mask declaration, and the XOR statements in
 * the loop body are missing from this chunk.
 */
639 vdev_raidz_pq_func(void *buf, size_t size, void *private)
641 struct pqr_struct *pqr = private;
642 const uint64_t *src = buf;
644 int i, cnt = size / sizeof (src[0]);
/* Valid only when generating P and Q (no R). */
646 ASSERT(pqr->p && pqr->q && !pqr->r);
648 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
650 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
/*
 * abd iterator callback: accumulate this buffer into P (XOR), Q (running
 * multiply by 2) and R (running multiply by 4) parity streams.
 * NOTE(review): braces, the mask declaration, and the XOR statements in
 * the loop body are missing from this chunk.
 */
658 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
660 struct pqr_struct *pqr = private;
661 const uint64_t *src = buf;
663 int i, cnt = size / sizeof (src[0]);
/* Valid only for triple-parity generation (P, Q and R all present). */
665 ASSERT(pqr->p && pqr->q && pqr->r);
667 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
669 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
671 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
/*
 * Generate single (P) parity: the first data column is copied into P,
 * every subsequent column is XORed in via vdev_raidz_p_func().
 * NOTE(review): braces and declarations (c, src, p) are missing from
 * this chunk.
 */
679 vdev_raidz_generate_parity_p(raidz_map_t *rm)
685 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
686 src = rm->rm_col[c].rc_abd;
687 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
/* First data column seeds P; later columns accumulate into it. */
689 if (c == rm->rm_firstdatacol) {
690 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
692 struct pqr_struct pqr = { p, NULL, NULL };
693 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
694 vdev_raidz_p_func, &pqr);
/*
 * Generate double (P + Q) parity.  The first data column seeds both P
 * and Q; each later column is folded in by vdev_raidz_pq_func().  Data
 * columns may be shorter than the parity columns; the shortfall is
 * treated as zeros, which for Q still requires the multiply-by-2 step.
 * NOTE(review): braces, some declarations, and the q[i] ^= / p XOR
 * statements inside the short-column loops are missing from this chunk.
 */
700 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
702 uint64_t *p, *q, pcnt, ccnt, mask, i;
706 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
707 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
708 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
710 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
711 src = rm->rm_col[c].rc_abd;
712 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
713 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
715 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
717 if (c == rm->rm_firstdatacol) {
718 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
719 (void) memcpy(q, p, rm->rm_col[c].rc_size);
721 struct pqr_struct pqr = { p, q, NULL };
722 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
723 vdev_raidz_pq_func, &pqr);
/* Pad out the tail of P/Q when the first column is short. */
726 if (c == rm->rm_firstdatacol) {
727 for (i = ccnt; i < pcnt; i++) {
733 * Treat short columns as though they are full of 0s.
734 * Note that there's therefore nothing needed for P.
736 for (i = ccnt; i < pcnt; i++) {
737 VDEV_RAIDZ_64MUL_2(q[i], mask);
/*
 * Generate triple (P + Q + R) parity.  Identical structure to the PQ
 * case, with R accumulated using multiply-by-4 (generator 4).
 * NOTE(review): braces, some declarations, and the zero-fill statements
 * inside the short-column loops are missing from this chunk.
 */
744 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
746 uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
750 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
/* All three parity columns must be the same size. */
751 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
752 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
753 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
754 rm->rm_col[VDEV_RAIDZ_R].rc_size);
756 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
757 src = rm->rm_col[c].rc_abd;
758 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
759 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
760 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
762 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
764 if (c == rm->rm_firstdatacol) {
765 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
766 (void) memcpy(q, p, rm->rm_col[c].rc_size);
767 (void) memcpy(r, p, rm->rm_col[c].rc_size);
769 struct pqr_struct pqr = { p, q, r };
770 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
771 vdev_raidz_pqr_func, &pqr);
/* Pad out the tail of P/Q/R when the first column is short. */
774 if (c == rm->rm_firstdatacol) {
775 for (i = ccnt; i < pcnt; i++) {
782 * Treat short columns as though they are full of 0s.
783 * Note that there's therefore nothing needed for P.
785 for (i = ccnt; i < pcnt; i++) {
786 VDEV_RAIDZ_64MUL_2(q[i], mask);
787 VDEV_RAIDZ_64MUL_4(r[i], mask);
794 * Generate RAID parity in the first virtual columns according to the number of
795 * parity columns available.
/*
 * Dispatch on rm_firstdatacol (== parity count): 1 -> P, 2 -> PQ,
 * 3 -> PQR; anything else panics.  NOTE(review): the case labels and
 * braces are missing from this chunk.
 */
798 vdev_raidz_generate_parity(raidz_map_t *rm)
800 switch (rm->rm_firstdatacol) {
802 vdev_raidz_generate_parity_p(rm);
805 vdev_raidz_generate_parity_pq(rm);
808 vdev_raidz_generate_parity_pqr(rm);
811 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
/*
 * Pairwise iterator callback for P-based reconstruction: XOR the source
 * buffer into the destination (the column being rebuilt).
 * NOTE(review): braces and the loop body ("*dst++ ^= *src++;" per the
 * pattern here -- confirm against full source) are missing.
 */
817 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
819 uint64_t *dst = dbuf;
820 uint64_t *src = sbuf;
821 int cnt = size / sizeof (src[0]);
823 for (int i = 0; i < cnt; i++) {
/*
 * Q-reconstruction pre-pass (pairwise): multiply the accumulator by 2
 * in GF(2^8), then XOR in the next data column -- Horner's-rule
 * evaluation of the Q polynomial.  NOTE(review): braces, the mask
 * declaration, and the XOR statement are missing from this chunk.
 */
832 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
835 uint64_t *dst = dbuf;
836 uint64_t *src = sbuf;
838 int cnt = size / sizeof (dst[0]);
840 for (int i = 0; i < cnt; i++, dst++, src++) {
841 VDEV_RAIDZ_64MUL_2(*dst, mask);
/*
 * Q-reconstruction pre-pass tail (single-buffer): same multiply-by-2 as
 * vdev_raidz_reconst_q_pre_func() for the region beyond a short source
 * column (implicit zeros, so no XOR).  NOTE(review): braces, the dst
 * and mask declarations are missing from this chunk.
 */
850 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
854 int cnt = size / sizeof (dst[0]);
856 for (int i = 0; i < cnt; i++, dst++) {
857 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
858 VDEV_RAIDZ_64MUL_2(*dst, mask);
/*
 * Context for the Q post-pass: the Q parity stream and the exponent to
 * undo the generator power for the missing column.
 * NOTE(review): the struct's member lines and closing "};" are missing.
 */
864 struct reconst_q_struct {
/*
 * Q-reconstruction post-pass: XOR the actual Q parity into the
 * accumulator, then multiply each byte by 2^exp to recover the missing
 * data column.  NOTE(review): braces, declarations (dst, j, b) and the
 * "*dst ^= *rq->q;" step are missing from this chunk.
 */
870 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
872 struct reconst_q_struct *rq = private;
874 int cnt = size / sizeof (dst[0]);
876 for (int i = 0; i < cnt; i++, dst++, rq->q++) {
/* Apply the inverse generator power byte-by-byte within the word. */
881 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
882 *b = vdev_raidz_exp2(*b, rq->exp);
/*
 * Context for PQ reconstruction: pointers into the real parity (p, q),
 * the parity computed with the two failed columns zeroed (pxy, qxy),
 * and the log-domain coefficients aexp/bexp.
 * NOTE(review): the struct's member lines and closing "};" are missing.
 */
889 struct reconst_pq_struct {
/*
 * Pairwise PQ callback: recover both missing data columns byte-by-byte:
 *   D_x = A*(P + Pxy) + B*(Q + Qxy)   (aexp/bexp are log_2 A / log_2 B)
 *   D_y = P + Pxy + D_x
 * NOTE(review): braces and the xd/yd declarations are missing.
 */
899 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
901 struct reconst_pq_struct *rpq = private;
905 for (int i = 0; i < size;
906 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
907 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
908 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
909 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
/*
 * Tail of PQ reconstruction for the region where column x is longer
 * than column y: only D_x needs to be recovered (y has no data here).
 * NOTE(review): braces and the xd declaration are missing.
 */
916 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
918 struct reconst_pq_struct *rpq = private;
921 for (int i = 0; i < size;
922 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
923 /* same operation as vdev_raidz_reconst_pq_func() on xd */
924 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
925 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
/*
 * Rebuild a single data column x from P parity: copy P into x, then
 * XOR in every other data column.  Returns a bitmask of the parity
 * columns used (P only).
 * NOTE(review): braces, declarations (x, c, src, dst) and the
 * "if (c == x) continue;" style skip inside the loop are missing from
 * this chunk.
 */
932 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
939 ASSERT(x >= rm->rm_firstdatacol);
940 ASSERT(x < rm->rm_cols);
942 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
943 ASSERT(rm->rm_col[x].rc_size > 0);
945 src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
946 dst = rm->rm_col[x].rc_abd;
/* Seed the target with P, then XOR in the surviving data columns. */
948 abd_copy(dst, src, rm->rm_col[x].rc_size);
950 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
951 uint64_t size = MIN(rm->rm_col[x].rc_size,
952 rm->rm_col[c].rc_size);
954 src = rm->rm_col[c].rc_abd;
955 dst = rm->rm_col[x].rc_abd;
960 (void) abd_iterate_func2(dst, src, 0, 0, size,
961 vdev_raidz_reconst_p_func, NULL);
964 return (1 << VDEV_RAIDZ_P);
/*
 * Rebuild a single data column x from Q parity: re-evaluate the Q
 * polynomial with column x treated as zero (pre pass), XOR in the real
 * Q, then divide out the generator power 2^(ndevs - 1 - x) (post pass).
 * Returns a bitmask of the parity columns used (Q only).
 * NOTE(review): braces and declarations (x, c, exp, src, dst) are
 * missing from this chunk.
 */
968 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
976 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
978 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
/* Column x contributes zeros -- its size is forced to 0 here. */
979 uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
980 rm->rm_col[c].rc_size);
982 src = rm->rm_col[c].rc_abd;
983 dst = rm->rm_col[x].rc_abd;
985 if (c == rm->rm_firstdatacol) {
986 abd_copy(dst, src, size);
987 if (rm->rm_col[x].rc_size > size)
988 abd_zero_off(dst, size,
989 rm->rm_col[x].rc_size - size);
991 ASSERT3U(size, <=, rm->rm_col[x].rc_size);
992 (void) abd_iterate_func2(dst, src, 0, 0, size,
993 vdev_raidz_reconst_q_pre_func, NULL);
994 (void) abd_iterate_func(dst,
995 size, rm->rm_col[x].rc_size - size,
996 vdev_raidz_reconst_q_pre_tail_func, NULL);
1000 src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1001 dst = rm->rm_col[x].rc_abd;
/* exp = 255 - (ndevs - 1 - x): log-domain inverse of 2^(ndevs - 1 - x). */
1002 exp = 255 - (rm->rm_cols - 1 - x);
1004 struct reconst_q_struct rq = { abd_to_buf(src), exp };
1005 (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
1006 vdev_raidz_reconst_q_post_func, &rq);
1008 return (1 << VDEV_RAIDZ_Q);
/*
 * Rebuild two missing data columns x and y (x < y by column order,
 * xsize >= ysize) using both P and Q.  Strategy: temporarily zero the
 * two columns (by setting their sizes to 0), regenerate parity into
 * scratch buffers to get Pxy/Qxy, then solve the 2x2 linear system per
 * byte (see the equations in the body).  Returns the bitmask P|Q.
 * NOTE(review): braces and declarations (x, y, xd, yd) are missing from
 * this chunk, as is the assignment of x/y from tgts[].
 */
1012 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
1014 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1015 abd_t *pdata, *qdata;
1016 uint64_t xsize, ysize;
1023 ASSERT(x >= rm->rm_firstdatacol);
1024 ASSERT(y < rm->rm_cols);
1026 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
1029 * Move the parity data aside -- we're going to compute parity as
1030 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1031 * reuse the parity generation mechanism without trashing the actual
1032 * parity so we make those columns appear to be full of zeros by
1033 * setting their lengths to zero.
1035 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
1036 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1037 xsize = rm->rm_col[x].rc_size;
1038 ysize = rm->rm_col[y].rc_size;
1040 rm->rm_col[VDEV_RAIDZ_P].rc_abd =
1041 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1042 rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
1043 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1044 rm->rm_col[x].rc_size = 0;
1045 rm->rm_col[y].rc_size = 0;
/* Compute Pxy/Qxy into the scratch parity buffers. */
1047 vdev_raidz_generate_parity_pq(rm);
1049 rm->rm_col[x].rc_size = xsize;
1050 rm->rm_col[y].rc_size = ysize;
1052 p = abd_to_buf(pdata);
1053 q = abd_to_buf(qdata);
1054 pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1055 qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1056 xd = rm->rm_col[x].rc_abd;
1057 yd = rm->rm_col[y].rc_abd;
1061 * Pxy = P + D_x + D_y
1062 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1064 * We can then solve for D_x:
1065 * D_x = A * (P + Pxy) + B * (Q + Qxy)
1067 * A = 2^(x - y) * (2^(x - y) + 1)^-1
1068 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1070 * With D_x in hand, we can easily solve for D_y:
1071 * D_y = P + Pxy + D_x
/* Coefficients computed in the log domain; tmp = log of (2^(x-y)+1)^-1. */
1074 a = vdev_raidz_pow2[255 + x - y];
1075 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
1076 tmp = 255 - vdev_raidz_log2[a ^ 1];
1078 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1079 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1081 ASSERT3U(xsize, >=, ysize);
1082 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
/* Solve for both columns where they overlap, then x's tail alone. */
1083 (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1084 vdev_raidz_reconst_pq_func, &rpq);
1085 (void) abd_iterate_func(xd, ysize, xsize - ysize,
1086 vdev_raidz_reconst_pq_tail_func, &rpq);
1088 abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1089 abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1092 * Restore the saved parity data.
1094 rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
1095 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1097 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
1102 * In the general case of reconstruction, we must solve the system of linear
1103 * equations defined by the coefficients used to generate parity as well as
1104 * the contents of the data and parity disks. This can be expressed with
1105 * vectors for the original data (D) and the actual data (d) and parity (p)
1106 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1110 * | V | | D_0 | | p_m-1 |
1111 * | | x | : | = | d_0 |
1112 * | I | | D_n-1 | | : |
1113 * | | ~~ ~~ | d_n-1 |
1116 * I is simply a square identity matrix of size n, and V is a vandermonde
1117 * matrix defined by the coefficients we chose for the various parity columns
1118 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1119 * computation as well as linear separability.
1122 * | 1 .. 1 1 1 | | p_0 |
1123 * | 2^n-1 .. 4 2 1 | __ __ | : |
1124 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
1125 * | 1 .. 0 0 0 | | D_1 | | d_0 |
1126 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
1127 * | : : : : | | : | | d_2 |
1128 * | 0 .. 1 0 0 | | D_n-1 | | : |
1129 * | 0 .. 0 1 0 | ~~ ~~ | : |
1130 * | 0 .. 0 0 1 | | d_n-1 |
1133 * Note that I, V, d, and p are known. To compute D, we must invert the
1134 * matrix and use the known data and parity values to reconstruct the unknown
1135 * data values. We begin by removing the rows in V|I and d|p that correspond
1136 * to failed or missing columns; we then make V|I square (n x n) and d|p
1137 * sized n by removing rows corresponding to unused parity from the bottom up
1138 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1139 * using Gauss-Jordan elimination. In the example below we use m=3 parity
1140 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1142 * | 1 1 1 1 1 1 1 1 |
1143 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
1144 * | 19 205 116 29 64 16 4 1 | / /
1145 * | 1 0 0 0 0 0 0 0 | / /
1146 * | 0 1 0 0 0 0 0 0 | <--' /
1147 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
1148 * | 0 0 0 1 0 0 0 0 |
1149 * | 0 0 0 0 1 0 0 0 |
1150 * | 0 0 0 0 0 1 0 0 |
1151 * | 0 0 0 0 0 0 1 0 |
1152 * | 0 0 0 0 0 0 0 1 |
1155 * | 1 1 1 1 1 1 1 1 |
1156 * | 19 205 116 29 64 16 4 1 |
1157 * | 1 0 0 0 0 0 0 0 |
1158 * (V|I)' = | 0 0 0 1 0 0 0 0 |
1159 * | 0 0 0 0 1 0 0 0 |
1160 * | 0 0 0 0 0 1 0 0 |
1161 * | 0 0 0 0 0 0 1 0 |
1162 * | 0 0 0 0 0 0 0 1 |
1165 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1166 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1167 * matrix is not singular.
1169 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1170 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1171 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1172 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1173 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1174 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1175 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1176 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1179 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1180 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1181 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1182 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1183 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1184 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1185 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1186 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1189 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1190 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1191 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1192 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1193 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1194 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1195 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1196 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1199 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1200 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1201 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1202 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1203 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1204 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1205 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1206 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1209 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1210 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1211 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1212 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1213 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1214 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1215 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1216 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1219 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1220 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1221 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1222 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1223 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1224 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1225 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1226 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1229 * | 0 0 1 0 0 0 0 0 |
1230 * | 167 100 5 41 159 169 217 208 |
1231 * | 166 100 4 40 158 168 216 209 |
1232 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1233 * | 0 0 0 0 1 0 0 0 |
1234 * | 0 0 0 0 0 1 0 0 |
1235 * | 0 0 0 0 0 0 1 0 |
1236 * | 0 0 0 0 0 0 0 1 |
1239 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1240 * of the missing data.
1242 * As is apparent from the example above, the only non-trivial rows in the
1243 * inverse matrix correspond to the data disks that we're trying to
1244 * reconstruct. Indeed, those are the only rows we need as the others would
1245 * only be useful for reconstructing data known or assumed to be valid. For
1246 * that reason, we only build the coefficients in the rows that correspond to
/*
 * Build the nmap rows of the parity matrix selected by map[]: each entry is
 * a parity index in 0..2 (P, Q or R per the ASSERTs below), and each row
 * coefficient is drawn from the GF(2^8) power table.  n is the number of
 * data columns in the map.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1252 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
/* n must equal the number of data columns in this map. */
1258 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1261 * Fill in the missing rows of interest.
1263 for (i = 0; i < nmap; i++) {
/* Only parity indices 0..2 are valid (triple parity maximum). */
1264 ASSERT3S(0, <=, map[i]);
1265 ASSERT3S(map[i], <=, 2);
1272 for (j = 0; j < n; j++) {
/* Coefficient is a power of 2 in GF(2^8), looked up by exponent. */
1276 rows[i][j] = vdev_raidz_pow2[pow];
/*
 * Invert the nmissing x nmissing submatrix of interest in `rows` using
 * Gauss-Jordan elimination over GF(2^8), accumulating the inverse into
 * `invrows`.  `used` lists the columns backing the matrix: the first
 * nmissing entries are parity columns, the rest data columns.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1282 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1283 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1289 * Assert that the first nmissing entries from the array of used
1290 * columns correspond to parity columns and that subsequent entries
1291 * correspond to data columns.
1293 for (i = 0; i < nmissing; i++) {
1294 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1296 for (; i < n; i++) {
1297 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1301 * First initialize the storage where we'll compute the inverse rows.
/* Start invrows as the identity matrix. */
1303 for (i = 0; i < nmissing; i++) {
1304 for (j = 0; j < n; j++) {
1305 invrows[i][j] = (i == j) ? 1 : 0;
1310 * Subtract all trivial rows from the rows of consequence.
1312 for (i = 0; i < nmissing; i++) {
1313 for (j = nmissing; j < n; j++) {
1314 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1315 jj = used[j] - rm->rm_firstdatacol;
1317 invrows[i][j] = rows[i][jj];
1323 * For each of the rows of interest, we must normalize it and subtract
1324 * a multiple of it from the other rows.
1326 for (i = 0; i < nmissing; i++) {
1327 for (j = 0; j < missing[i]; j++) {
1328 ASSERT0(rows[i][j]);
/* Pivot must be non-zero or the matrix would be singular. */
1330 ASSERT3U(rows[i][missing[i]], !=, 0);
1333 * Compute the inverse of the first element and multiply each
1334 * element in the row by that value.
/* 255 - log2(x) is the log of the multiplicative inverse in GF(2^8). */
1336 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1338 for (j = 0; j < n; j++) {
1339 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1340 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1343 for (ii = 0; ii < nmissing; ii++) {
1347 ASSERT3U(rows[ii][missing[i]], !=, 0);
1349 log = vdev_raidz_log2[rows[ii][missing[i]]];
1351 for (j = 0; j < n; j++) {
1353 vdev_raidz_exp2(rows[i][j], log);
1355 vdev_raidz_exp2(invrows[i][j], log);
1361 * Verify that the data that is left in the rows is properly part of
1362 * an identity matrix.
1364 for (i = 0; i < nmissing; i++) {
1365 for (j = 0; j < n; j++) {
1366 if (j == missing[i]) {
1367 ASSERT3U(rows[i][j], ==, 1);
1369 ASSERT0(rows[i][j]);
/*
 * Recompute the missing data columns by applying the inverted matrix
 * (invrows) to the surviving columns.  Multiplication in GF(2^8) is done
 * by adding logarithms (invlog) and exponentiating via the power table.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1376 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1377 int *missing, uint8_t **invrows, const uint8_t *used)
1382 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1383 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1387 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
/* Scratch space for the log form of the inverse matrix; freed below. */
1391 psize = sizeof (invlog[0][0]) * n * nmissing;
1392 p = kmem_alloc(psize, KM_SLEEP);
1394 for (pp = p, i = 0; i < nmissing; i++) {
/* Precompute logs of the inverse coefficients (all must be non-zero). */
1399 for (i = 0; i < nmissing; i++) {
1400 for (j = 0; j < n; j++) {
1401 ASSERT3U(invrows[i][j], !=, 0);
1402 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1406 for (i = 0; i < n; i++) {
1408 ASSERT3U(c, <, rm->rm_cols);
1410 src = abd_to_buf(rm->rm_col[c].rc_abd);
1411 ccount = rm->rm_col[c].rc_size;
1412 for (j = 0; j < nmissing; j++) {
1413 cc = missing[j] + rm->rm_firstdatacol;
1414 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1415 ASSERT3U(cc, <, rm->rm_cols);
1416 ASSERT3U(cc, !=, c);
1418 dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
1419 dcount[j] = rm->rm_col[cc].rc_size;
1422 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1424 for (x = 0; x < ccount; x++, src++) {
1426 log = vdev_raidz_log2[*src];
1428 for (cc = 0; cc < nmissing; cc++) {
/* Destination columns may be shorter than the source column. */
1429 if (x >= dcount[cc])
/* Reduce the exponent mod 255 (the GF(2^8) multiplicative order). */
1435 if ((ll = log + invlog[cc][i]) >= 255)
1437 val = vdev_raidz_pow2[ll];
1448 kmem_free(p, psize);
/*
 * General matrix-based reconstruction for any combination of up to
 * VDEV_RAIDZ_MAXPARITY missing columns: pick parity columns, build and
 * invert the matrix, then regenerate the missing data.  Returns a code
 * (asserted < 1 << VDEV_RAIDZ_MAXPARITY) identifying the parity used.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1452 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1456 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1457 int parity_map[VDEV_RAIDZ_MAXPARITY];
1462 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1463 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1466 abd_t **bufs = NULL;
1471 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1472 * temporary linear ABDs.
1474 if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
1475 bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
1477 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1478 raidz_col_t *col = &rm->rm_col[c];
/* Stash the original ABD and work on a linear copy. */
1480 bufs[c] = col->rc_abd;
1481 col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
1482 abd_copy(col->rc_abd, bufs[c], col->rc_size);
1486 n = rm->rm_cols - rm->rm_firstdatacol;
1489 * Figure out which data columns are missing.
1492 for (t = 0; t < ntgts; t++) {
1493 if (tgts[t] >= rm->rm_firstdatacol) {
1494 missing_rows[nmissing_rows++] =
1495 tgts[t] - rm->rm_firstdatacol;
1500 * Figure out which parity columns to use to help generate the missing
1503 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1505 ASSERT(c < rm->rm_firstdatacol);
1508 * Skip any targeted parity columns.
1510 if (c == tgts[tt]) {
1522 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
/* One allocation covers rows, invrows and the `used` array. */
1524 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1525 nmissing_rows * n + sizeof (used[0]) * n;
1526 p = kmem_alloc(psize, KM_SLEEP);
1528 for (pp = p, i = 0; i < nmissing_rows; i++) {
1536 for (i = 0; i < nmissing_rows; i++) {
1537 used[i] = parity_map[i];
1540 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1541 if (tt < nmissing_rows &&
1542 c == missing_rows[tt] + rm->rm_firstdatacol) {
1553 * Initialize the interesting rows of the matrix.
1555 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1558 * Invert the matrix.
1560 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1564 * Reconstruct the missing data using the generated matrix.
1566 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1569 kmem_free(p, psize);
1572 * copy back from temporary linear abds and free them
1575 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1576 raidz_col_t *col = &rm->rm_col[c];
1578 abd_copy(bufs[c], col->rc_abd, col->rc_size);
1579 abd_free(col->rc_abd);
1580 col->rc_abd = bufs[c];
1582 kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
/*
 * Dispatch reconstruction: classify the (sorted) target list into bad
 * parity vs. bad data columns, try the specialized P/Q/PQ routines when
 * applicable, and otherwise fall back to the general matrix path.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1589 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1591 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1595 int nbadparity, nbaddata;
1596 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1599 * The tgts list must already be sorted.
1601 for (i = 1; i < nt; i++) {
1602 ASSERT(t[i] > t[i - 1]);
/* Assume everything is bad, then subtract what proves healthy below. */
1605 nbadparity = rm->rm_firstdatacol;
1606 nbaddata = rm->rm_cols - nbadparity;
1608 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1609 if (c < rm->rm_firstdatacol)
1610 parity_valid[c] = B_FALSE;
1612 if (i < nt && c == t[i]) {
1615 } else if (rm->rm_col[c].rc_error != 0) {
1617 } else if (c >= rm->rm_firstdatacol) {
1620 parity_valid[c] = B_TRUE;
1625 ASSERT(ntgts >= nt);
1626 ASSERT(nbaddata >= 0);
1627 ASSERT(nbaddata + nbadparity == ntgts);
/* dt points at the data-column portion of the target list. */
1629 dt = &tgts[nbadparity];
1632 * See if we can use any of our optimized reconstruction routines.
1634 if (!vdev_raidz_default_to_general) {
1637 if (parity_valid[VDEV_RAIDZ_P])
1638 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1640 ASSERT(rm->rm_firstdatacol > 1);
1642 if (parity_valid[VDEV_RAIDZ_Q])
1643 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1645 ASSERT(rm->rm_firstdatacol > 2);
1649 ASSERT(rm->rm_firstdatacol > 1);
1651 if (parity_valid[VDEV_RAIDZ_P] &&
1652 parity_valid[VDEV_RAIDZ_Q])
1653 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1655 ASSERT(rm->rm_firstdatacol > 2);
/* Fall back to the general matrix-based reconstruction. */
1661 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1662 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
/*
 * Open a RAID-Z vdev: validate the parity/child-count geometry, open all
 * children, and derive asize/max_asize/ashift from them.  More child open
 * failures than parity disks makes the vdev unopenable.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1668 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1669 uint64_t *logical_ashift, uint64_t *physical_ashift)
1672 uint64_t nparity = vd->vdev_nparity;
1677 ASSERT(nparity > 0);
/* Need at least nparity + 1 children for the geometry to make sense. */
1679 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1680 vd->vdev_children < nparity + 1) {
1681 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1682 return (SET_ERROR(EINVAL));
1685 vdev_open_children(vd);
1687 for (c = 0; c < vd->vdev_children; c++) {
1688 cvd = vd->vdev_child[c];
1690 if (cvd->vdev_open_error != 0) {
1691 lasterror = cvd->vdev_open_error;
/* Track the minimum child size and the maximum alignment shift. */
1696 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1697 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1698 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1699 *physical_ashift = MAX(*physical_ashift,
1700 cvd->vdev_physical_ashift);
1703 *asize *= vd->vdev_children;
1704 *max_asize *= vd->vdev_children;
/* More failed children than parity disks: no way to open. */
1706 if (numerrors > nparity) {
1707 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
/*
 * Close a RAID-Z vdev by closing each of its children.
 */
1715 vdev_raidz_close(vdev_t *vd)
1719 for (c = 0; c < vd->vdev_children; c++)
1720 vdev_close(vd->vdev_child[c]);
1725 * Handle a read or write I/O to a RAID-Z dump device.
1727 * The dump device is in a unique situation compared to other ZFS datasets:
1728 * writing to this device should be as simple and fast as possible. In
1729 * addition, durability matters much less since the dump will be extracted
1730 * once the machine reboots. For that reason, this function eschews parity for
1731 * performance and simplicity. The dump device uses the checksum setting
1732 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1735 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
1736 * 128 KB will not fill an entire block; in addition, they may not be properly
1737 * aligned. In that case, this function uses the preallocated 128 KB block and
1738 * omits reading or writing any "empty" portions of that block, as opposed to
1739 * allocating a fresh appropriately-sized block.
1741 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1743 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1745 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1746 * allocated which spans all five child vdevs. 8 KB of data would be written to
1747 * each of four vdevs, with the fifth containing the parity bits.
1749 * parity data data data data
1750 * | PP | XX | XX | XX | XX |
1753 * 8 KB parity ------8 KB data blocks------
1755 * However, when writing to the dump device, the behavior is different:
1757 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1759 * Unlike the normal RAID-Z case in which the block is allocated based on the
1760 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
1761 * I/O size is less than 128 KB, only the actual portions of data are written.
1762 * In this example the data is written to the third data vdev since that vdev
1763 * contains the offset [64 KB, 96 KB).
1765 * parity data data data data
1771 * As a result, an individual I/O may not span all child vdevs; moreover, a
1772 * small I/O may only operate on a single child vdev.
1774 * Note that since there are no parity bits calculated or written, this format
1775 * remains the same no matter how many parity bits are used in a normal RAID-Z
1776 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
1779 * parity parity parity data data data data
1780 * | | | | | | XX | |
/*
 * Perform a raw (dump-device) read or write against the RAID-Z layout.
 * Maps the [offset, offset+size) range onto the columns of a full
 * SPA_OLD_MAXBLOCKSIZE map and issues vdev_disk_physio() to each child
 * column the range touches.  See the block comment above for the format.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1786 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1787 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1789 vdev_t *tvd = vd->vdev_top;
1795 uint64_t start, end, colstart, colend;
1796 uint64_t coloffset, colsize, colskip;
1798 int flags = doread ? BIO_READ : BIO_WRITE;
1803 * Don't write past the end of the block
1805 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1811 * Allocate a RAID-Z map for this block. Note that this block starts
1812 * from the "original" offset, that is, the offset of the extent which
1813 * contains the requisite offset of the data being read or written.
1815 * Even if this I/O operation doesn't span the full block size, let's
1816 * treat the on-disk format as if the only blocks are the complete 128
1819 abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1820 SPA_OLD_MAXBLOCKSIZE);
1821 rm = vdev_raidz_map_alloc(abd,
1822 SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
1823 vd->vdev_children, vd->vdev_nparity);
1825 coloffset = origoffset;
/* Walk data columns only; parity is not maintained for dump I/O. */
1827 for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1828 c++, coloffset += rc->rc_size) {
1829 rc = &rm->rm_col[c];
1830 cvd = vd->vdev_child[rc->rc_devidx];
1833 * Find the start and end of this column in the RAID-Z map,
1834 * keeping in mind that the stated size and offset of the
1835 * operation may not fill the entire column for this vdev.
1837 * If any portion of the data spans this column, issue the
1838 * appropriate operation to the vdev.
1840 if (coloffset + rc->rc_size <= start)
1842 if (coloffset >= end)
/* Clamp the operation to this column's extent. */
1845 colstart = MAX(coloffset, start);
1846 colend = MIN(end, coloffset + rc->rc_size);
1847 colsize = colend - colstart;
1848 colskip = colstart - coloffset;
1850 VERIFY3U(colsize, <=, rc->rc_size);
1851 VERIFY3U(colskip, <=, rc->rc_size);
1854 * Note that the child vdev will have a vdev label at the start
1855 * of its range of offsets, hence the need for
1856 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1857 * example of why this calculation is needed.
1859 if ((err = vdev_disk_physio(cvd,
1860 ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
1861 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1862 flags, isdump)) != 0)
1866 vdev_raidz_map_free(rm);
/*
 * Convert a physical (logical data) size into the allocated size on a
 * RAID-Z vdev: round up to sectors, add parity sectors for every run of
 * (cols - nparity) data sectors, then round to a multiple of nparity + 1.
 */
1875 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1878 uint64_t ashift = vd->vdev_top->vdev_ashift;
1879 uint64_t cols = vd->vdev_children;
1880 uint64_t nparity = vd->vdev_nparity;
/* Number of ashift-sized sectors needed for the data. */
1882 asize = ((psize - 1) >> ashift) + 1;
/* One parity sector per (cols - nparity) data sectors, rounded up. */
1883 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1884 asize = roundup(asize, nparity + 1) << ashift;
/*
 * Child zio completion callback: record the child's error in its
 * raidz_col_t (passed via io_private).
 */
1890 vdev_raidz_child_done(zio_t *zio)
1892 raidz_col_t *rc = zio->io_private;
1894 rc->rc_error = zio->io_error;
1900 * Start an IO operation on a RAIDZ VDev
1903 * - For write operations:
1904 * 1. Generate the parity data
1905 * 2. Create child zio write operations to each column's vdev, for both
1907 * 3. If the column skips any sectors for padding, create optional dummy
1908 * write zio children for those areas to improve aggregation continuity.
1909 * - For read operations:
1910 * 1. Create child zio read operations to each data column's vdev to read
1911 * the range of data required for zio.
1912 * 2. If this is a scrub or resilver operation, or if any of the data
1913 * vdevs have had errors, then create zio read operations to the parity
1914 * columns' VDevs as well.
/*
 * Issue child I/Os for a RAID-Z zio (see the block comment above for the
 * per-type behavior: free, write with parity + optional skip-sector I/Os,
 * or read with parity fetched on demand).
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1917 vdev_raidz_io_start(zio_t *zio)
1919 vdev_t *vd = zio->io_vd;
1920 vdev_t *tvd = vd->vdev_top;
1926 rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
1927 zio->io_type == ZIO_TYPE_FREE,
1928 tvd->vdev_ashift, vd->vdev_children,
1932 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1934 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
/* FREE: forward the free to every column, parity included. */
1936 if (zio->io_type == ZIO_TYPE_FREE) {
1937 for (c = 0; c < rm->rm_cols; c++) {
1938 rc = &rm->rm_col[c];
1939 cvd = vd->vdev_child[rc->rc_devidx];
1940 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1941 rc->rc_offset, rc->rc_abd, rc->rc_size,
1942 zio->io_type, zio->io_priority, 0,
1943 vdev_raidz_child_done, rc));
/* WRITE: generate parity first, then write all columns. */
1950 if (zio->io_type == ZIO_TYPE_WRITE) {
1951 vdev_raidz_generate_parity(rm);
1953 for (c = 0; c < rm->rm_cols; c++) {
1954 rc = &rm->rm_col[c];
1955 cvd = vd->vdev_child[rc->rc_devidx];
1956 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1957 rc->rc_offset, rc->rc_abd, rc->rc_size,
1958 zio->io_type, zio->io_priority, 0,
1959 vdev_raidz_child_done, rc));
1963 * Generate optional I/Os for any skipped sectors to improve
1964 * aggregation contiguity.
1966 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1967 ASSERT(c <= rm->rm_scols);
1968 if (c == rm->rm_scols)
1970 rc = &rm->rm_col[c];
1971 cvd = vd->vdev_child[rc->rc_devidx];
1972 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1973 rc->rc_offset + rc->rc_size, NULL,
1974 1 << tvd->vdev_ashift,
1975 zio->io_type, zio->io_priority,
1976 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1983 ASSERT(zio->io_type == ZIO_TYPE_READ);
1986 * Iterate over the columns in reverse order so that we hit the parity
1987 * last -- any errors along the way will force us to read the parity.
1989 for (c = rm->rm_cols - 1; c >= 0; c--) {
1990 rc = &rm->rm_col[c];
1991 cvd = vd->vdev_child[rc->rc_devidx];
1992 if (!vdev_readable(cvd)) {
1993 if (c >= rm->rm_firstdatacol)
1994 rm->rm_missingdata++;
1996 rm->rm_missingparity++;
1997 rc->rc_error = SET_ERROR(ENXIO);
1998 rc->rc_tried = 1; /* don't even try */
/* Device may be missing data for this txg per its DTL. */
2002 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2003 if (c >= rm->rm_firstdatacol)
2004 rm->rm_missingdata++;
2006 rm->rm_missingparity++;
2007 rc->rc_error = SET_ERROR(ESTALE);
/* Read parity columns only when needed (errors, scrub, resilver). */
2011 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
2012 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2013 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2014 rc->rc_offset, rc->rc_abd, rc->rc_size,
2015 zio->io_type, zio->io_priority, 0,
2016 vdev_raidz_child_done, rc));
2025 * Report a checksum error for a child of a RAID-Z device.
/*
 * Account and report a checksum error against the child vdev backing a
 * column.  Speculative I/Os are exempt; `bad_data` is the data we believe
 * to be wrong, posted alongside the on-disk contents in the ereport.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
2028 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2031 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2033 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2034 zio_bad_cksum_t zbc;
2035 raidz_map_t *rm = zio->io_vsd;
2037 mutex_enter(&vd->vdev_stat_lock);
2038 vd->vdev_stat.vs_checksum_errors++;
2039 mutex_exit(&vd->vdev_stat_lock);
2041 zbc.zbc_has_cksum = 0;
2042 zbc.zbc_injected = rm->rm_ecksuminjected;
/* Borrow a linear copy of the column for the ereport payload. */
2044 buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2045 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2046 rc->rc_offset, rc->rc_size, buf, bad_data,
2048 abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2053 * We keep track of whether or not there were any injected errors, so that
2054 * any ereports we generate can note it.
/*
 * Verify the zio's checksum, noting in the map whether any failure was
 * caused by injected errors so later ereports can record it.
 * Returns the result of zio_checksum_error() (0 on success).
 */
2057 raidz_checksum_verify(zio_t *zio)
2059 zio_bad_cksum_t zbc;
2060 raidz_map_t *rm = zio->io_vsd;
2062 int ret = zio_checksum_error(zio, &zbc);
2063 if (ret != 0 && zbc.zbc_injected != 0)
2064 rm->rm_ecksuminjected = 1;
2070 * Generate the parity from the data columns. If we tried and were able to
2071 * read the parity without error, verify that the generated parity matches the
2072 * data we read. If it doesn't, we fire off a checksum error. Return the
2073 * number of such failures.
/*
 * Regenerate parity from the data columns and compare it against the
 * parity actually read, flagging mismatched columns with ECKSUM.
 * Skipped entirely for ZIO_CHECKSUM_NOPARITY (dump device) blocks.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
2076 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2078 void *orig[VDEV_RAIDZ_MAXPARITY];
2082 blkptr_t *bp = zio->io_bp;
2083 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2084 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2086 if (checksum == ZIO_CHECKSUM_NOPARITY)
/* Save a copy of each parity column that was read successfully. */
2089 for (c = 0; c < rm->rm_firstdatacol; c++) {
2090 rc = &rm->rm_col[c];
2091 if (!rc->rc_tried || rc->rc_error != 0)
2093 orig[c] = zio_buf_alloc(rc->rc_size);
2094 abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
2097 vdev_raidz_generate_parity(rm);
/* Compare the regenerated parity with what the disk returned. */
2099 for (c = 0; c < rm->rm_firstdatacol; c++) {
2100 rc = &rm->rm_col[c];
2101 if (!rc->rc_tried || rc->rc_error != 0)
2103 if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
2104 raidz_checksum_error(zio, rc, orig[c]);
2105 rc->rc_error = SET_ERROR(ECKSUM);
2108 zio_buf_free(orig[c], rc->rc_size);
2115 * Keep statistics on all the ways that we used parity to correct data.
/* Indexed by the reconstruction code returned by vdev_raidz_reconstruct(). */
2117 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
/*
 * Return the most severe error recorded across all columns of the map,
 * as ranked by zio_worst_error().
 */
2120 vdev_raidz_worst_error(raidz_map_t *rm)
2124 for (int c = 0; c < rm->rm_cols; c++)
2125 error = zio_worst_error(error, rm->rm_col[c].rc_error);
2131 * Iterate over all combinations of bad data and attempt a reconstruction.
2132 * Note that the algorithm below is non-optimal because it doesn't take into
2133 * account how reconstruction is actually performed. For example, with
2134 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2135 * is targeted as invalid as if columns 1 and 4 are targeted since in both
2136 * cases we'd only use parity information in column 0.
/*
 * Combinatorial reconstruction: for n = 1 .. (parity - total_errors),
 * enumerate every combination of n columns as hypothetical silent-error
 * targets, attempt reconstruction, and keep the first result that passes
 * checksum verification.  Returns the reconstruction code on success.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
2139 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
2141 raidz_map_t *rm = zio->io_vsd;
2143 void *orig[VDEV_RAIDZ_MAXPARITY];
/* tstore has sentinel slots before and after the real target list. */
2144 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2145 int *tgts = &tstore[1];
2146 int current, next, i, c, n;
2149 ASSERT(total_errors < rm->rm_firstdatacol);
2152 * This simplifies one edge condition.
2156 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
2158 * Initialize the targets array by finding the first n columns
2159 * that contain no error.
2161 * If there were no data errors, we need to ensure that we're
2162 * always explicitly attempting to reconstruct at least one
2163 * data column. To do this, we simply push the highest target
2164 * up into the data columns.
2166 for (c = 0, i = 0; i < n; i++) {
2167 if (i == n - 1 && data_errors == 0 &&
2168 c < rm->rm_firstdatacol) {
2169 c = rm->rm_firstdatacol;
2172 while (rm->rm_col[c].rc_error != 0) {
2174 ASSERT3S(c, <, rm->rm_cols);
2181 * Setting tgts[n] simplifies the other edge condition.
2183 tgts[n] = rm->rm_cols;
2186 * These buffers were allocated in previous iterations.
2188 for (i = 0; i < n - 1; i++) {
2189 ASSERT(orig[i] != NULL);
2192 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2195 next = tgts[current];
2197 while (current != n) {
2198 tgts[current] = next;
2202 * Save off the original data that we're going to
2203 * attempt to reconstruct.
2205 for (i = 0; i < n; i++) {
2206 ASSERT(orig[i] != NULL);
2209 ASSERT3S(c, <, rm->rm_cols);
2210 rc = &rm->rm_col[c];
2211 abd_copy_to_buf(orig[i], rc->rc_abd,
2216 * Attempt a reconstruction and exit the outer loop on
2219 code = vdev_raidz_reconstruct(rm, tgts, n);
2220 if (raidz_checksum_verify(zio) == 0) {
2221 atomic_inc_64(&raidz_corrected[code]);
/* Report a checksum error against each reconstructed column. */
2223 for (i = 0; i < n; i++) {
2225 rc = &rm->rm_col[c];
2226 ASSERT(rc->rc_error == 0);
2228 raidz_checksum_error(zio, rc,
2230 rc->rc_error = SET_ERROR(ECKSUM);
2238 * Restore the original data.
2240 for (i = 0; i < n; i++) {
2242 rc = &rm->rm_col[c];
2243 abd_copy_from_buf(rc->rc_abd, orig[i],
2249 * Find the next valid column after the current
2252 for (next = tgts[current] + 1;
2253 next < rm->rm_cols &&
2254 rm->rm_col[next].rc_error != 0; next++)
2257 ASSERT(next <= tgts[current + 1]);
2260 * If that spot is available, we're done here.
2262 if (next != tgts[current + 1])
2266 * Otherwise, find the next valid column after
2267 * the previous position.
2269 for (c = tgts[current - 1] + 1;
2270 rm->rm_col[c].rc_error != 0; c++)
2276 } while (current != n);
2281 for (i = 0; i < n; i++) {
2282 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2289 * Complete an IO operation on a RAIDZ VDev
2292 * - For write operations:
2293 * 1. Check for errors on the child IOs.
2294 * 2. Return, setting an error code if too few child VDevs were written
2295 * to reconstruct the data later. Note that partial writes are
2296 * considered successful if they can be reconstructed at all.
2297 * - For read operations:
2298 * 1. Check for errors on the child IOs.
2299 * 2. If data errors occurred:
2300 * a. Try to reassemble the data from the parity available.
2301 * b. If we haven't yet read the parity drives, read them now.
2302 * c. If all parity drives have been read but the data still doesn't
2303 * reassemble with a correct checksum, then try combinatorial
2305 * d. If that doesn't work, return an error.
2306 * 3. If there were unexpected errors or this is a resilver operation,
2307 * rewrite the vdevs that had errors.
/*
 * Completion handler for RAID-Z zios; see the block comment above for the
 * overall read/write flow.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
2310 vdev_raidz_io_done(zio_t *zio)
2312 vdev_t *vd = zio->io_vd;
2314 raidz_map_t *rm = zio->io_vsd;
2316 int unexpected_errors = 0;
2317 int parity_errors = 0;
2318 int parity_untried = 0;
2319 int data_errors = 0;
2320 int total_errors = 0;
2322 int tgts[VDEV_RAIDZ_MAXPARITY];
2325 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
2327 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2328 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
/* Tally parity/data errors and untried parity columns. */
2330 for (c = 0; c < rm->rm_cols; c++) {
2331 rc = &rm->rm_col[c];
2334 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
2336 if (c < rm->rm_firstdatacol)
2341 if (!rc->rc_skipped)
2342 unexpected_errors++;
2345 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2350 if (zio->io_type == ZIO_TYPE_WRITE) {
2352 * XXX -- for now, treat partial writes as a success.
2353 * (If we couldn't write enough columns to reconstruct
2354 * the data, the I/O failed. Otherwise, good enough.)
2356 * Now that we support write reallocation, it would be better
2357 * to treat partial failure as real failure unless there are
2358 * no non-degraded top-level vdevs left, and not update DTLs
2359 * if we intend to reallocate.
2362 if (total_errors > rm->rm_firstdatacol)
2363 zio->io_error = vdev_raidz_worst_error(rm);
2366 } else if (zio->io_type == ZIO_TYPE_FREE) {
2370 ASSERT(zio->io_type == ZIO_TYPE_READ);
2372 * There are three potential phases for a read:
2373 * 1. produce valid data from the columns read
2374 * 2. read all disks and try again
2375 * 3. perform combinatorial reconstruction
2377 * Each phase is progressively both more expensive and less likely to
2378 * occur. If we encounter more errors than we can repair or all phases
2379 * fail, we have no choice but to return an error.
2383 * If the number of errors we saw was correctable -- less than or equal
2384 * to the number of parity disks read -- attempt to produce data that
2385 * has a valid checksum. Naturally, this case applies in the absence of
2388 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2389 if (data_errors == 0) {
2390 if (raidz_checksum_verify(zio) == 0) {
2392 * If we read parity information (unnecessarily
2393 * as it happens since no reconstruction was
2394 * needed) regenerate and verify the parity.
2395 * We also regenerate parity when resilvering
2396 * so we can write it out to the failed device
2399 if (parity_errors + parity_untried <
2400 rm->rm_firstdatacol ||
2401 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2402 n = raidz_parity_verify(zio, rm);
2403 unexpected_errors += n;
2404 ASSERT(parity_errors + n <=
2405 rm->rm_firstdatacol);
2411 * We either attempt to read all the parity columns or
2412 * none of them. If we didn't try to read parity, we
2413 * wouldn't be here in the correctable case. There must
2414 * also have been fewer parity errors than parity
2415 * columns or, again, we wouldn't be in this code path.
2417 ASSERT(parity_untried == 0);
2418 ASSERT(parity_errors < rm->rm_firstdatacol);
2421 * Identify the data columns that reported an error.
2424 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2425 rc = &rm->rm_col[c];
2426 if (rc->rc_error != 0) {
2427 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2432 ASSERT(rm->rm_firstdatacol >= n);
2434 code = vdev_raidz_reconstruct(rm, tgts, n);
2436 if (raidz_checksum_verify(zio) == 0) {
2437 atomic_inc_64(&raidz_corrected[code]);
2440 * If we read more parity disks than were used
2441 * for reconstruction, confirm that the other
2442 * parity disks produced correct data. This
2443 * routine is suboptimal in that it regenerates
2444 * the parity that we already used in addition
2445 * to the parity that we're attempting to
2446 * verify, but this should be a relatively
2447 * uncommon case, and can be optimized if it
2448 * becomes a problem. Note that we regenerate
2449 * parity when resilvering so we can write it
2450 * out to failed devices later.
2452 if (parity_errors < rm->rm_firstdatacol - n ||
2453 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2454 n = raidz_parity_verify(zio, rm);
2455 unexpected_errors += n;
2456 ASSERT(parity_errors + n <=
2457 rm->rm_firstdatacol);
2466 * This isn't a typical situation -- either we got a read error or
2467 * a child silently returned bad data. Read every block so we can
2468 * try again with as much data and parity as we can track down. If
2469 * we've already been through once before, all children will be marked
2470 * as tried so we'll proceed to combinatorial reconstruction.
2472 unexpected_errors = 1;
2473 rm->rm_missingdata = 0;
2474 rm->rm_missingparity = 0;
2476 for (c = 0; c < rm->rm_cols; c++) {
2477 if (rm->rm_col[c].rc_tried)
2480 zio_vdev_io_redone(zio);
2482 rc = &rm->rm_col[c];
2485 zio_nowait(zio_vdev_child_io(zio, NULL,
2486 vd->vdev_child[rc->rc_devidx],
2487 rc->rc_offset, rc->rc_abd, rc->rc_size,
2488 zio->io_type, zio->io_priority, 0,
2489 vdev_raidz_child_done, rc));
2490 } while (++c < rm->rm_cols);
2496 * At this point we've attempted to reconstruct the data given the
2497 * errors we detected, and we've attempted to read all columns. There
2498 * must, therefore, be one or more additional problems -- silent errors
2499 * resulting in invalid data rather than explicit I/O errors resulting
2500 * in absent data. We check if there is enough additional data to
2501 * possibly reconstruct the data and then perform combinatorial
2502 * reconstruction over all possible combinations. If that fails,
2505 if (total_errors > rm->rm_firstdatacol) {
2506 zio->io_error = vdev_raidz_worst_error(rm);
2508 } else if (total_errors < rm->rm_firstdatacol &&
2509 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2511 * If we didn't use all the available parity for the
2512 * combinatorial reconstruction, verify that the remaining
2513 * parity is correct.
2515 if (code != (1 << rm->rm_firstdatacol) - 1)
2516 (void) raidz_parity_verify(zio, rm);
2519 * We're here because either:
2521 * total_errors == rm_firstdatacol, or
2522 * vdev_raidz_combrec() failed
2524 * In either case, there is enough bad data to prevent
2527 * Start checksum ereports for all children which haven't
2528 * failed, and the IO wasn't speculative.
2530 zio->io_error = SET_ERROR(ECKSUM);
2532 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2533 for (c = 0; c < rm->rm_cols; c++) {
2534 rc = &rm->rm_col[c];
2535 if (rc->rc_error == 0) {
2536 zio_bad_cksum_t zbc;
2537 zbc.zbc_has_cksum = 0;
2539 rm->rm_ecksuminjected;
2541 zfs_ereport_start_checksum(
2543 vd->vdev_child[rc->rc_devidx],
2544 zio, rc->rc_offset, rc->rc_size,
2545 (void *)(uintptr_t)c, &zbc);
2552 zio_checksum_verified(zio);
/* Self-heal: rewrite any columns that reported errors. */
2554 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2555 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2557 * Use the good data we have in hand to repair damaged children.
2559 for (c = 0; c < rm->rm_cols; c++) {
2560 rc = &rm->rm_col[c];
2561 cvd = vd->vdev_child[rc->rc_devidx];
2563 if (rc->rc_error == 0)
2566 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2567 rc->rc_offset, rc->rc_abd, rc->rc_size,
2568 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2569 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2570 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2576 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2578 if (faulted > vd->vdev_nparity)
2579 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2580 VDEV_AUX_NO_REPLICAS);
2581 else if (degraded + faulted != 0)
2582 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2584 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2587 vdev_ops_t vdev_raidz_ops = {
2591 vdev_raidz_io_start,
2593 vdev_raidz_state_change,
2597 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2598 B_FALSE /* not a leaf vdev */