4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
29 #include <sys/zfs_context.h>
31 #include <sys/vdev_impl.h>
33 #include <sys/vdev_disk.h>
35 #include <sys/vdev_file.h>
36 #include <sys/vdev_raidz.h>
38 #include <sys/zio_checksum.h>
40 #include <sys/fs/zfs.h>
41 #include <sys/fm/fs/zfs.h>
45 #include <sys/vdev_initialize.h> /* vdev_xlate testing */
49 * Virtual device vector for RAID-Z.
51 * This vdev supports single, double, and triple parity. For single parity,
52 * we use a simple XOR of all the data columns. For double or triple parity,
53 * we use a special case of Reed-Solomon coding. This extends the
54 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
55 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
56 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
57 * former is also based. The latter is designed to provide higher performance
58 * for writes.
60 * Note that the Plank paper claimed to support arbitrary N+M, but was then
61 * amended six years later identifying a critical flaw that invalidates its
62 * claims. Nevertheless, the technique can be adapted to work for up to
63 * triple parity. For additional parity, the amendment "Note: Correction to
64 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
65 * is viable, but the additional complexity means that write performance will
66 * suffer.
68 * All of the methods above operate on a Galois field, defined over the
69 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
70 * can be expressed with a single byte. Briefly, the operations on the
71 * field are defined as follows:
73 * o addition (+) is represented by a bitwise XOR
74 * o subtraction (-) is therefore identical to addition: A + B = A - B
75 * o multiplication of A by 2 is defined by the following bitwise expression:
77 * (A * 2)_7 = A_6
78 * (A * 2)_6 = A_5
79 * (A * 2)_5 = A_4
80 * (A * 2)_4 = A_3 + A_7
81 * (A * 2)_3 = A_2 + A_7
82 * (A * 2)_2 = A_1 + A_7
83 * (A * 2)_1 = A_0
84 * (A * 2)_0 = A_7
86 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
87 * As an aside, this multiplication is derived from the error correcting
88 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
90 * Observe that any number in the field (except for 0) can be expressed as a
91 * power of 2 -- a generator for the field. We store a table of the powers of
92 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
93 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
94 * than field addition). The inverse of a field element A (A^-1) is therefore
95 * A ^ (255 - 1) = A^254.
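 * For example (our illustration, using the tables defined later in this
 * file): 3 * 5 = 2^(log_2(3) + log_2(5)) = 2^(25 + 50) =
 * vdev_raidz_pow2[75] = 0x0f = 15, which matches multiplying the
 * polynomials (x + 1)(x^2 + 1) = x^3 + x^2 + x + 1 over GF(2).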
97 * The up-to-three parity columns, P, Q, R over several data columns,
98 * D_0, ... D_n-1, can be expressed by field operations:
100 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
101 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
102 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
103 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
104 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
106 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
107 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
108 * independent coefficients. (There are no additional coefficients that have
109 * this property, which is why the uncorrected Plank method breaks down.)
111 * See the reconstruction code below for how P, Q and R can be used individually
112 * or in concert to recover missing data columns.
115 typedef struct raidz_col {
116 uint64_t rc_devidx; /* child device index for I/O */
117 uint64_t rc_offset; /* device offset */
118 uint64_t rc_size; /* I/O size */
119 abd_t *rc_abd; /* I/O data */
120 void *rc_gdata; /* used to store the "good" version */
121 int rc_error; /* I/O error for this device */
122 uint8_t rc_tried; /* Did we attempt this I/O column? */
123 uint8_t rc_skipped; /* Did we skip this I/O column? */
126 typedef struct raidz_map {
127 uint64_t rm_cols; /* Regular column count */
128 uint64_t rm_scols; /* Count including skipped columns */
129 uint64_t rm_bigcols; /* Number of oversized columns */
130 uint64_t rm_asize; /* Actual total I/O size */
131 uint64_t rm_missingdata; /* Count of missing data devices */
132 uint64_t rm_missingparity; /* Count of missing parity devices */
133 uint64_t rm_firstdatacol; /* First data column/parity count */
134 uint64_t rm_nskip; /* Skipped sectors for padding */
135 uint64_t rm_skipstart; /* Column index of padding start */
136 abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
137 uintptr_t rm_reports; /* # of referencing checksum reports */
138 uint8_t rm_freed; /* map no longer has referencing ZIO */
139 uint8_t rm_ecksuminjected; /* checksum error was injected */
140 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
143 #define VDEV_RAIDZ_P 0
144 #define VDEV_RAIDZ_Q 1
145 #define VDEV_RAIDZ_R 2
147 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
148 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
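/*
 * Illustrative sketch, not part of the driver (compiled out): one byte of
 * P, Q and R computed over n data bytes d[0..n-1] using the Horner-style
 * recurrences from the block comment at the top of this file. The helper
 * name is ours.
 */
#ifdef VDEV_RAIDZ_EXAMPLES
static void
vdev_raidz_pqr_byte(const uint8_t *d, int n, uint8_t *p, uint8_t *q,
    uint8_t *r)
{
	*p = *q = *r = 0;
	for (int i = 0; i < n; i++) {
		*p ^= d[i];				/* P: plain XOR */
		*q = VDEV_RAIDZ_MUL_2(*q) ^ d[i];	/* Q: shift in base 2 */
		*r = VDEV_RAIDZ_MUL_4(*r) ^ d[i];	/* R: shift in base 4 */
	}
}
#endif	/* VDEV_RAIDZ_EXAMPLES */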
151 * We provide a mechanism to perform the field multiplication operation on a
152 * 64-bit value all at once rather than a byte at a time. This works by
153 * creating a mask from the top bit in each byte and using that to
154 * conditionally apply the XOR of 0x1d.
156 #define VDEV_RAIDZ_64MUL_2(x, mask) \
158 (mask) = (x) & 0x8080808080808080ULL; \
159 (mask) = ((mask) << 1) - ((mask) >> 7); \
160 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
161 ((mask) & 0x1d1d1d1d1d1d1d1d); \
164 #define VDEV_RAIDZ_64MUL_4(x, mask) \
166 VDEV_RAIDZ_64MUL_2((x), mask); \
167 VDEV_RAIDZ_64MUL_2((x), mask); \
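/*
 * Illustrative sketch, not part of the driver (compiled out): the 64-bit
 * SWAR form above is equivalent to applying VDEV_RAIDZ_MUL_2() to each of
 * the eight bytes independently. The helper name is ours.
 */
#ifdef VDEV_RAIDZ_EXAMPLES
static uint64_t
vdev_raidz_64mul_2_bytewise(uint64_t x)
{
	uint64_t out = 0;

	for (int i = 0; i < 8; i++) {
		uint8_t b = (x >> (i * 8)) & 0xff;
		out |= (uint64_t)(VDEV_RAIDZ_MUL_2(b) & 0xff) << (i * 8);
	}
	/* out == x after VDEV_RAIDZ_64MUL_2(x, mask) */
	return (out);
}
#endif	/* VDEV_RAIDZ_EXAMPLES */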
170 #define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
173 * Force reconstruction to use the general purpose method.
175 int vdev_raidz_default_to_general;
177 /* Powers of 2 in the Galois field defined above. */
178 static const uint8_t vdev_raidz_pow2[256] = {
179 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
180 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
181 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
182 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
183 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
184 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
185 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
186 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
187 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
188 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
189 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
190 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
191 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
192 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
193 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
194 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
195 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
196 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
197 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
198 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
199 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
200 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
201 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
202 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
203 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
204 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
205 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
206 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
207 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
208 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
209 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
210 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
212 /* Logs of 2 in the Galois field defined above. */
213 static const uint8_t vdev_raidz_log2[256] = {
214 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
215 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
216 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
217 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
218 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
219 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
220 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
221 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
222 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
223 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
224 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
225 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
226 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
227 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
228 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
229 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
230 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
231 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
232 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
233 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
234 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
235 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
236 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
237 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
238 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
239 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
240 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
241 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
242 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
243 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
244 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
245 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
248 static void vdev_raidz_generate_parity(raidz_map_t *rm);
251 * Multiply a given number by 2 raised to the given power.
254 vdev_raidz_exp2(uint_t a, int exp)
260 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
262 exp += vdev_raidz_log2[a];
266 return (vdev_raidz_pow2[exp]);
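/*
 * Illustrative sketch, not part of the driver (compiled out): general
 * GF(2^8) multiplication of two field elements via the log/exp tables,
 * i.e. A * B = 2^(log_2(A) + log_2(B)). The helper name is ours.
 */
#ifdef VDEV_RAIDZ_EXAMPLES
static uint8_t
vdev_raidz_gf_mul(uint8_t a, uint8_t b)
{
	if (a == 0 || b == 0)
		return (0);
	return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
}
#endif	/* VDEV_RAIDZ_EXAMPLES */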
270 vdev_raidz_map_free(raidz_map_t *rm)
275 for (c = 0; c < rm->rm_firstdatacol; c++) {
276 if (rm->rm_col[c].rc_abd != NULL)
277 abd_free(rm->rm_col[c].rc_abd);
279 if (rm->rm_col[c].rc_gdata != NULL)
280 zio_buf_free(rm->rm_col[c].rc_gdata,
281 rm->rm_col[c].rc_size);
285 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
286 if (rm->rm_col[c].rc_abd != NULL)
287 abd_put(rm->rm_col[c].rc_abd);
288 size += rm->rm_col[c].rc_size;
291 if (rm->rm_abd_copy != NULL)
292 abd_free(rm->rm_abd_copy);
294 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
298 vdev_raidz_map_free_vsd(zio_t *zio)
300 raidz_map_t *rm = zio->io_vsd;
302 ASSERT0(rm->rm_freed);
305 if (rm->rm_reports == 0)
306 vdev_raidz_map_free(rm);
311 vdev_raidz_cksum_free(void *arg, size_t ignored)
313 raidz_map_t *rm = arg;
315 ASSERT3U(rm->rm_reports, >, 0);
317 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
318 vdev_raidz_map_free(rm);
322 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
324 raidz_map_t *rm = zcr->zcr_cbdata;
325 size_t c = zcr->zcr_cbinfo;
328 const char *good = NULL;
331 if (good_data == NULL) {
332 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
336 if (c < rm->rm_firstdatacol) {
338 * The first time through, calculate the parity blocks for
339 * the good data (this relies on the fact that the good
340 * data never changes for a given logical ZIO)
342 if (rm->rm_col[0].rc_gdata == NULL) {
343 abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
348 * Set up the rm_col[]s to generate the parity for
349 * good_data, first saving the parity bufs and
350 * replacing them with buffers to hold the result.
352 for (x = 0; x < rm->rm_firstdatacol; x++) {
353 bad_parity[x] = rm->rm_col[x].rc_abd;
354 rm->rm_col[x].rc_gdata =
355 zio_buf_alloc(rm->rm_col[x].rc_size);
356 rm->rm_col[x].rc_abd =
357 abd_get_from_buf(rm->rm_col[x].rc_gdata,
358 rm->rm_col[x].rc_size);
361 /* fill in the data columns from good_data */
362 buf = (char *)good_data;
363 for (; x < rm->rm_cols; x++) {
364 abd_put(rm->rm_col[x].rc_abd);
365 rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
366 rm->rm_col[x].rc_size);
367 buf += rm->rm_col[x].rc_size;
371 * Construct the parity from the good data.
373 vdev_raidz_generate_parity(rm);
375 /* restore everything back to its original state */
376 for (x = 0; x < rm->rm_firstdatacol; x++) {
377 abd_put(rm->rm_col[x].rc_abd);
378 rm->rm_col[x].rc_abd = bad_parity[x];
382 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
383 abd_put(rm->rm_col[x].rc_abd);
384 rm->rm_col[x].rc_abd = abd_get_offset(
385 rm->rm_abd_copy, offset);
386 offset += rm->rm_col[x].rc_size;
390 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
391 good = rm->rm_col[c].rc_gdata;
393 /* adjust good_data to point at the start of our column */
396 for (x = rm->rm_firstdatacol; x < c; x++)
397 good += rm->rm_col[x].rc_size;
400 bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
401 /* we drop the ereport if it ends up that the data was good */
402 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
403 abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
407 * Invoked indirectly by zfs_ereport_start_checksum(), called
408 * below when our read operation fails completely. The main point
409 * is to keep a copy of everything we read from disk, so that at
410 * vdev_raidz_cksum_finish() time we can compare it with the good data.
413 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
415 size_t c = (size_t)(uintptr_t)arg;
418 raidz_map_t *rm = zio->io_vsd;
421 /* set up the report and bump the refcount */
422 zcr->zcr_cbdata = rm;
424 zcr->zcr_finish = vdev_raidz_cksum_finish;
425 zcr->zcr_free = vdev_raidz_cksum_free;
428 ASSERT3U(rm->rm_reports, >, 0);
430 if (rm->rm_abd_copy != NULL)
434 * It's the first time we're called for this raidz_map_t, so we need
435 * to copy the data aside; there's no guarantee that our zio's buffer
436 * won't be re-used for something else.
438 * Our parity data is already in separate buffers, so there's no need
439 * to copy them.
443 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
444 size += rm->rm_col[c].rc_size;
447 abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
449 for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
450 raidz_col_t *col = &rm->rm_col[c];
451 abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
453 abd_copy(tmp, col->rc_abd, col->rc_size);
454 abd_put(col->rc_abd);
457 offset += col->rc_size;
459 ASSERT3U(offset, ==, size);
462 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
463 vdev_raidz_map_free_vsd,
464 vdev_raidz_cksum_report
468 * Divides the IO evenly across all child vdevs; usually, dcols is
469 * the number of children in the target vdev.
472 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
473 uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
476 /* The starting RAIDZ (parent) vdev sector of the block. */
477 uint64_t b = offset >> unit_shift;
478 /* The zio's size in units of the vdev's minimum sector size. */
479 uint64_t s = size >> unit_shift;
480 /* The first column for this stripe. */
481 uint64_t f = b % dcols;
482 /* The starting byte offset on each child vdev. */
483 uint64_t o = (b / dcols) << unit_shift;
484 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
488 * "Quotient": The number of data sectors for this stripe on all but
489 * the "big column" child vdevs that also contain "remainder" data.
491 q = s / (dcols - nparity);
494 * "Remainder": The number of partial stripe data sectors in this I/O.
495 * This will add a sector to some, but not all, child vdevs.
497 r = s - q * (dcols - nparity);
499 /* The number of "big columns" - those which contain remainder data. */
500 bc = (r == 0 ? 0 : r + nparity);
503 * The total number of data and parity sectors associated with
506 tot = s + nparity * (q + (r == 0 ? 0 : 1));
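	/*
	 * Worked example (ours, for illustration): a 32 KB write to a
	 * 5-wide raidz1 with unit_shift = 12 (4 KB sectors) gives
	 * s = 8, q = 8 / (5 - 1) = 2, r = 0, bc = 0 and
	 * tot = 8 + 1 * (2 + 0) = 10: two full rows, each holding one
	 * parity sector and four data sectors.
	 */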
508 /* acols: The columns that will be accessed. */
509 /* scols: The columns that will be accessed or skipped. */
511 /* Our I/O request doesn't span all child vdevs. */
513 scols = MIN(dcols, roundup(bc, nparity + 1));
519 ASSERT3U(acols, <=, scols);
521 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
524 rm->rm_scols = scols;
526 rm->rm_skipstart = bc;
527 rm->rm_missingdata = 0;
528 rm->rm_missingparity = 0;
529 rm->rm_firstdatacol = nparity;
530 rm->rm_abd_copy = NULL;
533 rm->rm_ecksuminjected = 0;
537 for (c = 0; c < scols; c++) {
542 coff += 1ULL << unit_shift;
544 rm->rm_col[c].rc_devidx = col;
545 rm->rm_col[c].rc_offset = coff;
546 rm->rm_col[c].rc_abd = NULL;
547 rm->rm_col[c].rc_gdata = NULL;
548 rm->rm_col[c].rc_error = 0;
549 rm->rm_col[c].rc_tried = 0;
550 rm->rm_col[c].rc_skipped = 0;
553 rm->rm_col[c].rc_size = 0;
555 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
557 rm->rm_col[c].rc_size = q << unit_shift;
559 asize += rm->rm_col[c].rc_size;
562 ASSERT3U(asize, ==, tot << unit_shift);
563 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
564 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
565 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
566 ASSERT3U(rm->rm_nskip, <=, nparity);
569 for (c = 0; c < rm->rm_firstdatacol; c++) {
570 rm->rm_col[c].rc_abd =
571 abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
574 rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
575 off = rm->rm_col[c].rc_size;
577 for (c = c + 1; c < acols; c++) {
578 rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
579 off += rm->rm_col[c].rc_size;
584 * If all data stored spans all columns, there's a danger that parity
585 * will always be on the same device and, since parity isn't read
586 * during normal operation, that that device's I/O bandwidth won't be
587 * used effectively. We therefore switch the parity every 1MB.
589 * ... at least that was, ostensibly, the theory. As a practical
590 * matter unless we juggle the parity between all devices evenly, we
591 * won't see any benefit. Further, occasional writes that aren't a
592 * multiple of the LCM of the number of children and the minimum
593 * stripe width are sufficient to avoid pessimal behavior.
594 * Unfortunately, this decision created an implicit on-disk format
595 * requirement that we need to support for all eternity, but only
596 * for single-parity RAID-Z.
598 * If we intend to skip a sector in the zeroth column for padding
599 * we must make sure to note this swap. We will never intend to
600 * skip the first column since at least one data and one parity
601 * column must appear in each row.
603 ASSERT(rm->rm_cols >= 2);
604 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
606 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
607 devidx = rm->rm_col[0].rc_devidx;
608 o = rm->rm_col[0].rc_offset;
609 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
610 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
611 rm->rm_col[1].rc_devidx = devidx;
612 rm->rm_col[1].rc_offset = o;
614 if (rm->rm_skipstart == 0)
615 rm->rm_skipstart = 1;
628 vdev_raidz_p_func(void *buf, size_t size, void *private)
630 struct pqr_struct *pqr = private;
631 const uint64_t *src = buf;
632 int i, cnt = size / sizeof (src[0]);
634 ASSERT(pqr->p && !pqr->q && !pqr->r);
636 for (i = 0; i < cnt; i++, src++, pqr->p++)
643 vdev_raidz_pq_func(void *buf, size_t size, void *private)
645 struct pqr_struct *pqr = private;
646 const uint64_t *src = buf;
648 int i, cnt = size / sizeof (src[0]);
650 ASSERT(pqr->p && pqr->q && !pqr->r);
652 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
654 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
662 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
664 struct pqr_struct *pqr = private;
665 const uint64_t *src = buf;
667 int i, cnt = size / sizeof (src[0]);
669 ASSERT(pqr->p && pqr->q && pqr->r);
671 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
673 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
675 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
683 vdev_raidz_generate_parity_p(raidz_map_t *rm)
689 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
690 src = rm->rm_col[c].rc_abd;
691 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
693 if (c == rm->rm_firstdatacol) {
694 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
696 struct pqr_struct pqr = { p, NULL, NULL };
697 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
698 vdev_raidz_p_func, &pqr);
704 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
706 uint64_t *p, *q, pcnt, ccnt, mask, i;
710 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
711 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
712 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
714 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
715 src = rm->rm_col[c].rc_abd;
716 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
717 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
719 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
721 if (c == rm->rm_firstdatacol) {
722 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
723 (void) memcpy(q, p, rm->rm_col[c].rc_size);
725 struct pqr_struct pqr = { p, q, NULL };
726 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
727 vdev_raidz_pq_func, &pqr);
730 if (c == rm->rm_firstdatacol) {
731 for (i = ccnt; i < pcnt; i++) {
737 * Treat short columns as though they are full of 0s.
738 * Note that there's therefore nothing needed for P.
740 for (i = ccnt; i < pcnt; i++) {
741 VDEV_RAIDZ_64MUL_2(q[i], mask);
748 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
750 uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
754 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
755 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
756 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
757 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
758 rm->rm_col[VDEV_RAIDZ_R].rc_size);
760 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
761 src = rm->rm_col[c].rc_abd;
762 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
763 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
764 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
766 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
768 if (c == rm->rm_firstdatacol) {
769 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
770 (void) memcpy(q, p, rm->rm_col[c].rc_size);
771 (void) memcpy(r, p, rm->rm_col[c].rc_size);
773 struct pqr_struct pqr = { p, q, r };
774 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
775 vdev_raidz_pqr_func, &pqr);
778 if (c == rm->rm_firstdatacol) {
779 for (i = ccnt; i < pcnt; i++) {
786 * Treat short columns as though they are full of 0s.
787 * Note that there's therefore nothing needed for P.
789 for (i = ccnt; i < pcnt; i++) {
790 VDEV_RAIDZ_64MUL_2(q[i], mask);
791 VDEV_RAIDZ_64MUL_4(r[i], mask);
798 * Generate RAID parity in the first virtual columns according to the number of
799 * parity columns available.
802 vdev_raidz_generate_parity(raidz_map_t *rm)
804 switch (rm->rm_firstdatacol) {
806 vdev_raidz_generate_parity_p(rm);
809 vdev_raidz_generate_parity_pq(rm);
812 vdev_raidz_generate_parity_pqr(rm);
815 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
821 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
823 uint64_t *dst = dbuf;
824 uint64_t *src = sbuf;
825 int cnt = size / sizeof (src[0]);
827 for (int i = 0; i < cnt; i++) {
836 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
839 uint64_t *dst = dbuf;
840 uint64_t *src = sbuf;
842 int cnt = size / sizeof (dst[0]);
844 for (int i = 0; i < cnt; i++, dst++, src++) {
845 VDEV_RAIDZ_64MUL_2(*dst, mask);
854 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
858 int cnt = size / sizeof (dst[0]);
860 for (int i = 0; i < cnt; i++, dst++) {
861 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
862 VDEV_RAIDZ_64MUL_2(*dst, mask);
868 struct reconst_q_struct {
874 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
876 struct reconst_q_struct *rq = private;
878 int cnt = size / sizeof (dst[0]);
880 for (int i = 0; i < cnt; i++, dst++, rq->q++) {
885 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
886 *b = vdev_raidz_exp2(*b, rq->exp);
893 struct reconst_pq_struct {
903 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
905 struct reconst_pq_struct *rpq = private;
909 for (int i = 0; i < size;
910 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
911 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
912 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
913 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
920 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
922 struct reconst_pq_struct *rpq = private;
925 for (int i = 0; i < size;
926 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
927 /* same operation as vdev_raidz_reconst_pq_func() on xd */
928 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
929 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
936 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
943 ASSERT(x >= rm->rm_firstdatacol);
944 ASSERT(x < rm->rm_cols);
946 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
947 ASSERT(rm->rm_col[x].rc_size > 0);
949 src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
950 dst = rm->rm_col[x].rc_abd;
952 abd_copy(dst, src, rm->rm_col[x].rc_size);
954 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
955 uint64_t size = MIN(rm->rm_col[x].rc_size,
956 rm->rm_col[c].rc_size);
958 src = rm->rm_col[c].rc_abd;
959 dst = rm->rm_col[x].rc_abd;
964 (void) abd_iterate_func2(dst, src, 0, 0, size,
965 vdev_raidz_reconst_p_func, NULL);
968 return (1 << VDEV_RAIDZ_P);
972 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
980 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
982 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
983 uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
984 rm->rm_col[c].rc_size);
986 src = rm->rm_col[c].rc_abd;
987 dst = rm->rm_col[x].rc_abd;
989 if (c == rm->rm_firstdatacol) {
990 abd_copy(dst, src, size);
991 if (rm->rm_col[x].rc_size > size)
992 abd_zero_off(dst, size,
993 rm->rm_col[x].rc_size - size);
995 ASSERT3U(size, <=, rm->rm_col[x].rc_size);
996 (void) abd_iterate_func2(dst, src, 0, 0, size,
997 vdev_raidz_reconst_q_pre_func, NULL);
998 (void) abd_iterate_func(dst,
999 size, rm->rm_col[x].rc_size - size,
1000 vdev_raidz_reconst_q_pre_tail_func, NULL);
1004 src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1005 dst = rm->rm_col[x].rc_abd;
1006 exp = 255 - (rm->rm_cols - 1 - x);
1008 struct reconst_q_struct rq = { abd_to_buf(src), exp };
1009 (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
1010 vdev_raidz_reconst_q_post_func, &rq);
1012 return (1 << VDEV_RAIDZ_Q);
1016 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
1018 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1019 abd_t *pdata, *qdata;
1020 uint64_t xsize, ysize;
1027 ASSERT(x >= rm->rm_firstdatacol);
1028 ASSERT(y < rm->rm_cols);
1030 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
1033 * Move the parity data aside -- we're going to compute parity as
1034 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1035 * reuse the parity generation mechanism without trashing the actual
1036 * parity so we make those columns appear to be full of zeros by
1037 * setting their lengths to zero.
1039 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
1040 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1041 xsize = rm->rm_col[x].rc_size;
1042 ysize = rm->rm_col[y].rc_size;
1044 rm->rm_col[VDEV_RAIDZ_P].rc_abd =
1045 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1046 rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
1047 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1048 rm->rm_col[x].rc_size = 0;
1049 rm->rm_col[y].rc_size = 0;
1051 vdev_raidz_generate_parity_pq(rm);
1053 rm->rm_col[x].rc_size = xsize;
1054 rm->rm_col[y].rc_size = ysize;
1056 p = abd_to_buf(pdata);
1057 q = abd_to_buf(qdata);
1058 pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1059 qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1060 xd = rm->rm_col[x].rc_abd;
1061 yd = rm->rm_col[y].rc_abd;
1065 * Pxy = P + D_x + D_y
1066 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1068 * We can then solve for D_x:
1069 * D_x = A * (P + Pxy) + B * (Q + Qxy)
1071 * A = 2^(x - y) * (2^(x - y) + 1)^-1
1072 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1074 * With D_x in hand, we can easily solve for D_y:
1075 * D_y = P + Pxy + D_x
1078 a = vdev_raidz_pow2[255 + x - y];
1079 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
1080 tmp = 255 - vdev_raidz_log2[a ^ 1];
1082 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1083 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1085 ASSERT3U(xsize, >=, ysize);
1086 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1087 (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1088 vdev_raidz_reconst_pq_func, &rpq);
1089 (void) abd_iterate_func(xd, ysize, xsize - ysize,
1090 vdev_raidz_reconst_pq_tail_func, &rpq);
1092 abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1093 abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1096 * Restore the saved parity data.
1098 rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
1099 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1101 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
1106 * In the general case of reconstruction, we must solve the system of linear
1107 * equations defined by the coefficients used to generate parity as well as
1108 * the contents of the data and parity disks. This can be expressed with
1109 * vectors for the original data (D) and the actual data (d) and parity (p)
1110 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1114 * | V | | D_0 | | p_m-1 |
1115 * | | x | : | = | d_0 |
1116 * | I | | D_n-1 | | : |
1117 * | | ~~ ~~ | d_n-1 |
1120 * I is simply a square identity matrix of size n, and V is a Vandermonde
1121 * matrix defined by the coefficients we chose for the various parity columns
1122 * (1, 2, 4). Note that these values were chosen for simplicity and speed of
1123 * computation, as well as for linear separability.
1126 * | 1 .. 1 1 1 | | p_0 |
1127 * | 2^n-1 .. 4 2 1 | __ __ | : |
1128 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
1129 * | 1 .. 0 0 0 | | D_1 | | d_0 |
1130 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
1131 * | : : : : | | : | | d_2 |
1132 * | 0 .. 1 0 0 | | D_n-1 | | : |
1133 * | 0 .. 0 1 0 | ~~ ~~ | : |
1134 * | 0 .. 0 0 1 | | d_n-1 |
1137 * Note that I, V, d, and p are known. To compute D, we must invert the
1138 * matrix and use the known data and parity values to reconstruct the unknown
1139 * data values. We begin by removing the rows in V|I and d|p that correspond
1140 * to failed or missing columns; we then make V|I square (n x n) and d|p
1141 * sized n by removing rows corresponding to unused parity from the bottom up
1142 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1143 * using Gauss-Jordan elimination. In the example below we use m=3 parity
1144 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1146 * | 1 1 1 1 1 1 1 1 |
1147 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
1148 * | 19 205 116 29 64 16 4 1 | / /
1149 * | 1 0 0 0 0 0 0 0 | / /
1150 * | 0 1 0 0 0 0 0 0 | <--' /
1151 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
1152 * | 0 0 0 1 0 0 0 0 |
1153 * | 0 0 0 0 1 0 0 0 |
1154 * | 0 0 0 0 0 1 0 0 |
1155 * | 0 0 0 0 0 0 1 0 |
1156 * | 0 0 0 0 0 0 0 1 |
1159 * | 1 1 1 1 1 1 1 1 |
1160 * | 19 205 116 29 64 16 4 1 |
1161 * | 1 0 0 0 0 0 0 0 |
1162 * (V|I)' = | 0 0 0 1 0 0 0 0 |
1163 * | 0 0 0 0 1 0 0 0 |
1164 * | 0 0 0 0 0 1 0 0 |
1165 * | 0 0 0 0 0 0 1 0 |
1166 * | 0 0 0 0 0 0 0 1 |
1169 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1170 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1171 * matrix is not singular.
1173 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1174 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1175 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1176 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1177 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1178 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1179 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1180 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1183 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1184 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1185 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1186 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1187 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1188 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1189 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1190 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1193 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1194 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1195 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1196 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1197 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1198 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1199 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1200 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1203 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1204 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1205 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1206 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1207 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1208 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1209 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1210 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1213 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1214 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1215 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1216 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1217 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1218 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1219 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1220 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1223 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1224 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1225 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1226 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1227 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1228 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1229 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1230 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1233 * | 0 0 1 0 0 0 0 0 |
1234 * | 167 100 5 41 159 169 217 208 |
1235 * | 166 100 4 40 158 168 216 209 |
1236 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1237 * | 0 0 0 0 1 0 0 0 |
1238 * | 0 0 0 0 0 1 0 0 |
1239 * | 0 0 0 0 0 0 1 0 |
1240 * | 0 0 0 0 0 0 0 1 |
1243 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1244 * of the missing data.
1246 * As is apparent from the example above, the only non-trivial rows in the
1247 * inverse matrix correspond to the data disks that we're trying to
1248 * reconstruct. Indeed, those are the only rows we need as the others would
1249 * only be useful for reconstructing data known or assumed to be valid. For
1250 * that reason, we only build the coefficients in the rows that correspond to
1251 * targeted columns.
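 * As a quick sanity check of the example rows above (ours): with n = 8
 * data columns the R row is 4^7 .. 4^0 = 2^14 .. 2^0, and
 * vdev_raidz_pow2[14] = 0x13 = 19, vdev_raidz_pow2[12] = 0xcd = 205 and
 * vdev_raidz_pow2[10] = 0x74 = 116, matching the leading entries
 * | 19 205 116 ... | shown above.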
1256 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1262 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1265 * Fill in the missing rows of interest.
1267 for (i = 0; i < nmap; i++) {
1268 ASSERT3S(0, <=, map[i]);
1269 ASSERT3S(map[i], <=, 2);
1276 for (j = 0; j < n; j++) {
1280 rows[i][j] = vdev_raidz_pow2[pow];
1286 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1287 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1293 * Assert that the first nmissing entries from the array of used
1294 * columns correspond to parity columns and that subsequent entries
1295 * correspond to data columns.
1297 for (i = 0; i < nmissing; i++) {
1298 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1300 for (; i < n; i++) {
1301 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1305 * First initialize the storage where we'll compute the inverse rows.
1307 for (i = 0; i < nmissing; i++) {
1308 for (j = 0; j < n; j++) {
1309 invrows[i][j] = (i == j) ? 1 : 0;
1314 * Subtract all trivial rows from the rows of consequence.
1316 for (i = 0; i < nmissing; i++) {
1317 for (j = nmissing; j < n; j++) {
1318 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1319 jj = used[j] - rm->rm_firstdatacol;
1321 invrows[i][j] = rows[i][jj];
1327 * For each of the rows of interest, we must normalize it and subtract
1328 * a multiple of it from the other rows.
1330 for (i = 0; i < nmissing; i++) {
1331 for (j = 0; j < missing[i]; j++) {
1332 ASSERT0(rows[i][j]);
1334 ASSERT3U(rows[i][missing[i]], !=, 0);
1337 * Compute the inverse of the first element and multiply each
1338 * element in the row by that value.
1340 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1342 for (j = 0; j < n; j++) {
1343 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1344 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1347 for (ii = 0; ii < nmissing; ii++) {
1351 ASSERT3U(rows[ii][missing[i]], !=, 0);
1353 log = vdev_raidz_log2[rows[ii][missing[i]]];
1355 for (j = 0; j < n; j++) {
1357 vdev_raidz_exp2(rows[i][j], log);
1359 vdev_raidz_exp2(invrows[i][j], log);
1365 * Verify that the data left in the rows is properly part of
1366 * an identity matrix.
1368 for (i = 0; i < nmissing; i++) {
1369 for (j = 0; j < n; j++) {
1370 if (j == missing[i]) {
1371 ASSERT3U(rows[i][j], ==, 1);
1373 ASSERT0(rows[i][j]);
1380 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1381 int *missing, uint8_t **invrows, const uint8_t *used)
1386 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1387 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1391 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1395 psize = sizeof (invlog[0][0]) * n * nmissing;
1396 p = kmem_alloc(psize, KM_SLEEP);
1398 for (pp = p, i = 0; i < nmissing; i++) {
1403 for (i = 0; i < nmissing; i++) {
1404 for (j = 0; j < n; j++) {
1405 ASSERT3U(invrows[i][j], !=, 0);
1406 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1410 for (i = 0; i < n; i++) {
1412 ASSERT3U(c, <, rm->rm_cols);
1414 src = abd_to_buf(rm->rm_col[c].rc_abd);
1415 ccount = rm->rm_col[c].rc_size;
1416 for (j = 0; j < nmissing; j++) {
1417 cc = missing[j] + rm->rm_firstdatacol;
1418 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1419 ASSERT3U(cc, <, rm->rm_cols);
1420 ASSERT3U(cc, !=, c);
1422 dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
1423 dcount[j] = rm->rm_col[cc].rc_size;
1426 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1428 for (x = 0; x < ccount; x++, src++) {
1430 log = vdev_raidz_log2[*src];
1432 for (cc = 0; cc < nmissing; cc++) {
1433 if (x >= dcount[cc])
1439 if ((ll = log + invlog[cc][i]) >= 255)
1441 val = vdev_raidz_pow2[ll];
1452 kmem_free(p, psize);
1456 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1460 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1461 int parity_map[VDEV_RAIDZ_MAXPARITY];
1466 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1467 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1470 abd_t **bufs = NULL;
1475 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1476 * temporary linear ABDs.
1478 if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
1479 bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
1481 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1482 raidz_col_t *col = &rm->rm_col[c];
1484 bufs[c] = col->rc_abd;
1485 col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
1486 abd_copy(col->rc_abd, bufs[c], col->rc_size);
1490 n = rm->rm_cols - rm->rm_firstdatacol;
1493 * Figure out which data columns are missing.
1496 for (t = 0; t < ntgts; t++) {
1497 if (tgts[t] >= rm->rm_firstdatacol) {
1498 missing_rows[nmissing_rows++] =
1499 tgts[t] - rm->rm_firstdatacol;
1504 * Figure out which parity columns to use to help generate the missing
1507 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1509 ASSERT(c < rm->rm_firstdatacol);
1512 * Skip any targeted parity columns.
1514 if (c == tgts[tt]) {
1526 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1528 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1529 nmissing_rows * n + sizeof (used[0]) * n;
1530 p = kmem_alloc(psize, KM_SLEEP);
1532 for (pp = p, i = 0; i < nmissing_rows; i++) {
1540 for (i = 0; i < nmissing_rows; i++) {
1541 used[i] = parity_map[i];
1544 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1545 if (tt < nmissing_rows &&
1546 c == missing_rows[tt] + rm->rm_firstdatacol) {
1557 * Initialize the interesting rows of the matrix.
1559 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1562 * Invert the matrix.
1564 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1568 * Reconstruct the missing data using the generated matrix.
1570 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1573 kmem_free(p, psize);
1576 * copy back from temporary linear abds and free them
1579 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1580 raidz_col_t *col = &rm->rm_col[c];
1582 abd_copy(bufs[c], col->rc_abd, col->rc_size);
1583 abd_free(col->rc_abd);
1584 col->rc_abd = bufs[c];
1586 kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
1593 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1595 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1599 int nbadparity, nbaddata;
1600 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1603 * The tgts list must already be sorted.
1605 for (i = 1; i < nt; i++) {
1606 ASSERT(t[i] > t[i - 1]);
1609 nbadparity = rm->rm_firstdatacol;
1610 nbaddata = rm->rm_cols - nbadparity;
1612 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1613 if (c < rm->rm_firstdatacol)
1614 parity_valid[c] = B_FALSE;
1616 if (i < nt && c == t[i]) {
1619 } else if (rm->rm_col[c].rc_error != 0) {
1621 } else if (c >= rm->rm_firstdatacol) {
1624 parity_valid[c] = B_TRUE;
1629 ASSERT(ntgts >= nt);
1630 ASSERT(nbaddata >= 0);
1631 ASSERT(nbaddata + nbadparity == ntgts);
1633 dt = &tgts[nbadparity];
1636 * See if we can use any of our optimized reconstruction routines.
1638 if (!vdev_raidz_default_to_general) {
1641 if (parity_valid[VDEV_RAIDZ_P])
1642 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1644 ASSERT(rm->rm_firstdatacol > 1);
1646 if (parity_valid[VDEV_RAIDZ_Q])
1647 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1649 ASSERT(rm->rm_firstdatacol > 2);
1653 ASSERT(rm->rm_firstdatacol > 1);
1655 if (parity_valid[VDEV_RAIDZ_P] &&
1656 parity_valid[VDEV_RAIDZ_Q])
1657 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1659 ASSERT(rm->rm_firstdatacol > 2);
1665 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1666 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1672 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1673 uint64_t *logical_ashift, uint64_t *physical_ashift)
1676 uint64_t nparity = vd->vdev_nparity;
1681 ASSERT(nparity > 0);
1683 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1684 vd->vdev_children < nparity + 1) {
1685 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1686 return (SET_ERROR(EINVAL));
1689 vdev_open_children(vd);
1691 for (c = 0; c < vd->vdev_children; c++) {
1692 cvd = vd->vdev_child[c];
1694 if (cvd->vdev_open_error != 0) {
1695 lasterror = cvd->vdev_open_error;
1700 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1701 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1702 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1703 *physical_ashift = MAX(*physical_ashift,
1704 cvd->vdev_physical_ashift);
1707 *asize *= vd->vdev_children;
1708 *max_asize *= vd->vdev_children;
1710 if (numerrors > nparity) {
1711 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1719 vdev_raidz_close(vdev_t *vd)
1723 for (c = 0; c < vd->vdev_children; c++)
1724 vdev_close(vd->vdev_child[c]);
1729 * Handle a read or write I/O to a RAID-Z dump device.
1731 * The dump device is in a unique situation compared to other ZFS datasets:
1732 * writing to this device should be as simple and fast as possible. In
1733 * addition, durability matters much less since the dump will be extracted
1734 * once the machine reboots. For that reason, this function eschews parity for
1735 * performance and simplicity. The dump device uses the checksum setting
1736 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1737 * dataset.
1739 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
1740 * 128 KB will not fill an entire block; in addition, they may not be properly
1741 * aligned. In that case, this function uses the preallocated 128 KB block and
1742 * omits reading or writing any "empty" portions of that block, as opposed to
1743 * allocating a fresh appropriately-sized block.
1745 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1747 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1749 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1750 * allocated which spans all five child vdevs. 8 KB of data would be written to
1751 * each of four vdevs, with the fifth containing the parity bits.
1753 * parity data data data data
1754 * | PP | XX | XX | XX | XX |
1757 * 8 KB parity ------8 KB data blocks------
1759 * However, when writing to the dump device, the behavior is different:
1761 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1763 * Unlike the normal RAID-Z case in which the block is allocated based on the
1764 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
1765 * I/O size is less than 128 KB, only the actual portions of data are written.
1766 * In this example the data is written to the third data vdev since that vdev
1767 * contains the offset [64 KB, 96 KB).
1769 * parity data data data data
1775 * As a result, an individual I/O may not span all child vdevs; moreover, a
1776 * small I/O may only operate on a single child vdev.
1778 * Note that since there are no parity bits calculated or written, this format
1779 * remains the same no matter how many parity bits are used in a normal RAID-Z
1780 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
1781 * would look like:
1783 * parity parity parity data data data data
1784 * | | | | | | XX | |
1790 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1791 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1793 vdev_t *tvd = vd->vdev_top;
1799 uint64_t start, end, colstart, colend;
1800 uint64_t coloffset, colsize, colskip;
1802 int flags = doread ? BIO_READ : BIO_WRITE;
1807 * Don't write past the end of the block
1809 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1815 * Allocate a RAID-Z map for this block. Note that this block starts
1816 * from the "original" offset, that is, the offset of the extent which
1817 * contains the requisite offset of the data being read or written.
1819 * Even if this I/O operation doesn't span the full block size, let's
1820 * treat the on-disk format as if the only blocks are the complete 128
1821 * KB blocks.
1823 abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1824 SPA_OLD_MAXBLOCKSIZE);
1825 rm = vdev_raidz_map_alloc(abd,
1826 SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
1827 vd->vdev_children, vd->vdev_nparity);
1829 coloffset = origoffset;
1831 for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1832 c++, coloffset += rc->rc_size) {
1833 rc = &rm->rm_col[c];
1834 cvd = vd->vdev_child[rc->rc_devidx];
1837 * Find the start and end of this column in the RAID-Z map,
1838 * keeping in mind that the stated size and offset of the
1839 * operation may not fill the entire column for this vdev.
1841 * If any portion of the data spans this column, issue the
1842 * appropriate operation to the vdev.
1844 if (coloffset + rc->rc_size <= start)
1846 if (coloffset >= end)
1849 colstart = MAX(coloffset, start);
1850 colend = MIN(end, coloffset + rc->rc_size);
1851 colsize = colend - colstart;
1852 colskip = colstart - coloffset;
1854 VERIFY3U(colsize, <=, rc->rc_size);
1855 VERIFY3U(colskip, <=, rc->rc_size);
1858 * Note that the child vdev will have a vdev label at the start
1859 * of its range of offsets, hence the need for
1860 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1861 * example of why this calculation is needed.
1863 if ((err = vdev_disk_physio(cvd,
1864 ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
1865 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1866 flags, isdump)) != 0)
1870 vdev_raidz_map_free(rm);
1879 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1882 uint64_t ashift = vd->vdev_top->vdev_ashift;
1883 uint64_t cols = vd->vdev_children;
1884 uint64_t nparity = vd->vdev_nparity;
1886 asize = ((psize - 1) >> ashift) + 1;
1887 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1888 asize = roundup(asize, nparity + 1) << ashift;
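	/*
	 * Worked example (ours, for illustration): psize = 32 KB on a
	 * 5-wide raidz1 with ashift = 12 gives 8 data sectors, plus
	 * 1 * ceil(8 / 4) = 2 parity sectors; roundup(10, 2) << 12 is
	 * 40 KB -- the "at least 40 KB" figure cited in the dump-device
	 * comment above.
	 */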
1894 vdev_raidz_child_done(zio_t *zio)
1896 raidz_col_t *rc = zio->io_private;
1898 rc->rc_error = zio->io_error;
1904 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
1907 vdev_t *vd = zio->io_vd;
1908 vdev_t *tvd = vd->vdev_top;
1910 range_seg_t logical_rs, physical_rs;
1911 logical_rs.rs_start = zio->io_offset;
1912 logical_rs.rs_end = logical_rs.rs_start +
1913 vdev_raidz_asize(zio->io_vd, zio->io_size);
1915 raidz_col_t *rc = &rm->rm_col[col];
1916 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1918 vdev_xlate(cvd, &logical_rs, &physical_rs);
1919 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
1920 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
1922 * It would be nice to assert that rs_end is equal
1923 * to rc_offset + rc_size but there might be an
1924 * optional I/O at the end that is not accounted in
1925 * rc_size.
1927 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
1928 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
1929 rc->rc_size + (1 << tvd->vdev_ashift));
1931 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
1937 * Start an IO operation on a RAIDZ VDev
1940 * - For write operations:
1941 * 1. Generate the parity data
1942 * 2. Create child zio write operations to each column's vdev, for both
1944 * 3. If the column skips any sectors for padding, create optional dummy
1945 * write zio children for those areas to improve aggregation continuity.
1946 * - For read operations:
1947 * 1. Create child zio read operations to each data column's vdev to read
1948 * the range of data required for zio.
1949 * 2. If this is a scrub or resilver operation, or if any of the data
1950 * vdevs have had errors, then create zio read operations to the parity
1951 * columns' VDevs as well.
1954 vdev_raidz_io_start(zio_t *zio)
1956 vdev_t *vd = zio->io_vd;
1957 vdev_t *tvd = vd->vdev_top;
1963 rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
1964 zio->io_type == ZIO_TYPE_FREE,
1965 tvd->vdev_ashift, vd->vdev_children,
1969 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1971 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1973 if (zio->io_type == ZIO_TYPE_FREE) {
1974 for (c = 0; c < rm->rm_cols; c++) {
1975 rc = &rm->rm_col[c];
1976 cvd = vd->vdev_child[rc->rc_devidx];
1977 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1978 rc->rc_offset, rc->rc_abd, rc->rc_size,
1979 zio->io_type, zio->io_priority, 0,
1980 vdev_raidz_child_done, rc));
1987 if (zio->io_type == ZIO_TYPE_WRITE) {
1988 vdev_raidz_generate_parity(rm);
1990 for (c = 0; c < rm->rm_cols; c++) {
1991 rc = &rm->rm_col[c];
1992 cvd = vd->vdev_child[rc->rc_devidx];
1995 * Verify physical to logical translation.
1997 vdev_raidz_io_verify(zio, rm, c);
1999 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2000 rc->rc_offset, rc->rc_abd, rc->rc_size,
2001 zio->io_type, zio->io_priority, 0,
2002 vdev_raidz_child_done, rc));
2006 * Generate optional I/Os for any skipped sectors to improve
2007 * aggregation contiguity.
2009 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
2010 ASSERT(c <= rm->rm_scols);
2011 if (c == rm->rm_scols)
2013 rc = &rm->rm_col[c];
2014 cvd = vd->vdev_child[rc->rc_devidx];
2015 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2016 rc->rc_offset + rc->rc_size, NULL,
2017 1 << tvd->vdev_ashift,
2018 zio->io_type, zio->io_priority,
2019 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2026 ASSERT(zio->io_type == ZIO_TYPE_READ);
2029 * Iterate over the columns in reverse order so that we hit the parity
2030 * last -- any errors along the way will force us to read the parity.
2032 for (c = rm->rm_cols - 1; c >= 0; c--) {
2033 rc = &rm->rm_col[c];
2034 cvd = vd->vdev_child[rc->rc_devidx];
2035 if (!vdev_readable(cvd)) {
2036 if (c >= rm->rm_firstdatacol)
2037 rm->rm_missingdata++;
2039 rm->rm_missingparity++;
2040 rc->rc_error = SET_ERROR(ENXIO);
2041 rc->rc_tried = 1; /* don't even try */
2045 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2046 if (c >= rm->rm_firstdatacol)
2047 rm->rm_missingdata++;
2049 rm->rm_missingparity++;
2050 rc->rc_error = SET_ERROR(ESTALE);
2054 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
2055 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2056 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2057 rc->rc_offset, rc->rc_abd, rc->rc_size,
2058 zio->io_type, zio->io_priority, 0,
2059 vdev_raidz_child_done, rc));
2068 * Report a checksum error for a child of a RAID-Z device.
2071 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2074 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2076 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2077 zio_bad_cksum_t zbc;
2078 raidz_map_t *rm = zio->io_vsd;
2080 mutex_enter(&vd->vdev_stat_lock);
2081 vd->vdev_stat.vs_checksum_errors++;
2082 mutex_exit(&vd->vdev_stat_lock);
2084 zbc.zbc_has_cksum = 0;
2085 zbc.zbc_injected = rm->rm_ecksuminjected;
2087 buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2088 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2089 rc->rc_offset, rc->rc_size, buf, bad_data,
2091 abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2096 * We keep track of whether or not there were any injected errors, so that
2097 * any ereports we generate can note it.
2100 raidz_checksum_verify(zio_t *zio)
2102 zio_bad_cksum_t zbc;
2103 raidz_map_t *rm = zio->io_vsd;
2105 int ret = zio_checksum_error(zio, &zbc);
2106 if (ret != 0 && zbc.zbc_injected != 0)
2107 rm->rm_ecksuminjected = 1;
2113 * Generate the parity from the data columns. If we tried and were able to
2114 * read the parity without error, verify that the generated parity matches the
2115 * data we read. If it doesn't, we fire off a checksum error. Return the
2116 * number of such failures.
2119 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2121 void *orig[VDEV_RAIDZ_MAXPARITY];
2125 blkptr_t *bp = zio->io_bp;
2126 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2127 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2129 if (checksum == ZIO_CHECKSUM_NOPARITY)
2132 for (c = 0; c < rm->rm_firstdatacol; c++) {
2133 rc = &rm->rm_col[c];
2134 if (!rc->rc_tried || rc->rc_error != 0)
2136 orig[c] = zio_buf_alloc(rc->rc_size);
2137 abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
2140 vdev_raidz_generate_parity(rm);
2142 for (c = 0; c < rm->rm_firstdatacol; c++) {
2143 rc = &rm->rm_col[c];
2144 if (!rc->rc_tried || rc->rc_error != 0)
2146 if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
2147 raidz_checksum_error(zio, rc, orig[c]);
2148 rc->rc_error = SET_ERROR(ECKSUM);
2151 zio_buf_free(orig[c], rc->rc_size);
2158 * Keep statistics on all the ways that we used parity to correct data.
2160 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
2163 vdev_raidz_worst_error(raidz_map_t *rm)
2167 for (int c = 0; c < rm->rm_cols; c++)
2168 error = zio_worst_error(error, rm->rm_col[c].rc_error);
2174 * Iterate over all combinations of bad data and attempt a reconstruction.
2175 * Note that the algorithm below is non-optimal because it doesn't take into
2176 * account how reconstruction is actually performed. For example, with
2177 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2178 * is targeted as invalid as if columns 1 and 4 are targeted since in both
2179 * cases we'd only use parity information in column 0.
2182 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
2184 raidz_map_t *rm = zio->io_vsd;
2186 void *orig[VDEV_RAIDZ_MAXPARITY];
2187 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2188 int *tgts = &tstore[1];
2189 int current, next, i, c, n;
	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;
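		/*
		 * Together, tgts[-1] == -1 and tgts[n] == rm->rm_cols act as
		 * sentinels, so the odometer-style advance in the do/while
		 * loop below never needs an explicit bounds check at either
		 * end of the target array.
		 */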
		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				abd_copy_to_buf(orig[i], rc->rc_abd,
				    rc->rc_size);
			}
			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}
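			/*
			 * Reaching this point means the reconstruction above
			 * did not produce data with a valid checksum; the
			 * target columns now hold that incorrect attempt and
			 * must be put back before trying the next combination.
			 */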
			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				abd_copy_from_buf(rc->rc_abd, orig[i],
				    rc->rc_size);
			}
			do {
				/*
				 * Find the next valid column after the
				 * current position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);	/* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}
	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	} else if (zio->io_type == ZIO_TYPE_FREE) {
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */
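	/*
	 * Phase 1 is the correctable-errors block immediately below; phases
	 * 2 and 3 follow further down once the cheaper attempts have been
	 * exhausted.
	 */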
	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);
			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}
	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
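		/*
		 * zio_vdev_io_redone() resets the zio so that this io_done
		 * path runs again once the child reads issued below have
		 * completed.
		 */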
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}
	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct. The code is a bitmask of the parity
		 * columns used; all ones means every parity column already
		 * participated.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, provided the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);
		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}
done:
	zio_checksum_verified(zio);
	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered. The function
 * assumes that at least one DTL is dirty which implies that full stripe
 * width blocks must be resilvered.
 */
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
{
	uint64_t dcols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = ((psize - 1) >> ashift) + 1;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
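	/*
	 * A block spanning s data sectors plus nparity parity sectors touches
	 * min(s + nparity, dcols) distinct children starting at column f.
	 * For example (illustrative numbers), with dcols = 6, nparity = 2,
	 * and s = 3, the block touches 5 of the 6 children, so the per-child
	 * DTL checks below apply; once s + nparity >= dcols, every child is
	 * touched and we must resilver unconditionally.
	 */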
	if (s + nparity >= dcols)
		return (B_TRUE);

	for (uint64_t c = 0; c < s + nparity; c++) {
		uint64_t devidx = (f + c) % dcols;
		vdev_t *cvd = vd->vdev_child[devidx];

		/*
		 * dsl_scan_need_resilver() already checked vd with
		 * vdev_dtl_contains(). So here just check cvd with
		 * vdev_dtl_empty(), cheaper and a good approximation.
		 */
		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
			return (B_TRUE);
	}

	return (B_FALSE);
}
static void
vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
{
	vdev_t *raidvd = cvd->vdev_parent;
	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);

	uint64_t width = raidvd->vdev_children;
	uint64_t tgt_col = cvd->vdev_id;
	uint64_t ashift = raidvd->vdev_top->vdev_ashift;

	/* make sure the offsets are block-aligned */
	ASSERT0(in->rs_start % (1 << ashift));
	ASSERT0(in->rs_end % (1 << ashift));
	uint64_t b_start = in->rs_start >> ashift;
	uint64_t b_end = in->rs_end >> ashift;

	uint64_t start_row = 0;
	if (b_start > tgt_col) /* avoid underflow */
		start_row = ((b_start - tgt_col - 1) / width) + 1;

	uint64_t end_row = 0;
	if (b_end > tgt_col)
		end_row = ((b_end - tgt_col - 1) / width) + 1;
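	/*
	 * Parent sector (row * width + tgt_col) is the one this child
	 * contributes to row 'row', and it lands at child sector 'row'.
	 * start_row is thus the first row whose sector for tgt_col falls at
	 * or beyond b_start, and end_row the first at or beyond b_end, so
	 * [start_row, end_row) covers this child's share of the input range.
	 */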
	res->rs_start = start_row << ashift;
	res->rs_end = end_row << ashift;

	ASSERT3U(res->rs_start, <=, in->rs_start);
	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
}
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	vdev_raidz_need_resilver,
	NULL,			/* vdev_op_hold */
	NULL,			/* vdev_op_rele */
	NULL,			/* vdev_op_remap */
	vdev_raidz_xlate,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};