1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26
27 #pragma ident   "%Z%%M% %I%     %E% SMI"
28
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/zio.h>
33 #include <sys/zio_checksum.h>
34 #include <sys/fs/zfs.h>
35 #include <sys/fm/fs/zfs.h>
36
37 /*
38  * Virtual device vector for RAID-Z.
39  *
40  * This vdev supports both single and double parity. For single parity, we
41  * use a simple XOR of all the data columns. For double parity, we use both
42  * the simple XOR and a technique described in "The mathematics of
43  * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
44  * over the integers expressible in a single byte. Briefly, the operations on
45  * the field are defined as follows:
46  *
47  *   o addition (+) is represented by a bitwise XOR
48  *   o subtraction (-) is therefore identical to addition: A + B = A - B
49  *   o multiplication of A by 2 is defined by the following bitwise expression:
50  *      (A * 2)_7 = A_6
51  *      (A * 2)_6 = A_5
52  *      (A * 2)_5 = A_4
53  *      (A * 2)_4 = A_3 + A_7
54  *      (A * 2)_3 = A_2 + A_7
55  *      (A * 2)_2 = A_1 + A_7
56  *      (A * 2)_1 = A_0
57  *      (A * 2)_0 = A_7
58  *
59  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
60  *
61  * Observe that any number in the field (except for 0) can be expressed as a
62  * power of 2 -- a generator for the field. We store a table of the powers of
63  * 2 and logs base 2 for quick lookups, and exploit the fact that A * B can
64  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
65  * than field addition). The inverse of a field element A (A^-1) is A^254.
66  *
67  * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
68  * can be expressed by field operations:
69  *
70  *      P = D_0 + D_1 + ... + D_n-2 + D_n-1
71  *      Q = 2^(n-1) * D_0 + 2^(n-2) * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
72  *        = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
73  *
74  * See the reconstruction code below for how P and Q can be used individually
75  * in concert to recover missing data columns.
76  */
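
/*
 * Illustrative sketch (not part of the driver): computing P and Q over a
 * hypothetical buffer of data bytes, one byte per data column, using the
 * Horner form of Q shown above.  gf_mul2() is just the multiply-by-2
 * expression from this comment; 'data' and 'ndata' are assumed example names.
 *
 *	static uint8_t
 *	gf_mul2(uint8_t a)
 *	{
 *		return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
 *	}
 *
 *	uint8_t p = 0, q = 0;
 *	int i;
 *
 *	for (i = 0; i < ndata; i++) {
 *		p ^= data[i];
 *		q = gf_mul2(q) ^ data[i];
 *	}
 *
 * After the loop, p == D_0 + ... + D_n-1 and q == 2^(n-1) * D_0 + ... + D_n-1,
 * matching the definitions of P and Q above.
 */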
77
78 typedef struct raidz_col {
79         uint64_t rc_devidx;             /* child device index for I/O */
80         uint64_t rc_offset;             /* device offset */
81         uint64_t rc_size;               /* I/O size */
82         void *rc_data;                  /* I/O data */
83         int rc_error;                   /* I/O error for this device */
84         uint8_t rc_tried;               /* Did we attempt this I/O column? */
85         uint8_t rc_skipped;             /* Did we skip this I/O column? */
86 } raidz_col_t;
87
88 typedef struct raidz_map {
89         uint64_t rm_cols;               /* Column count */
90         uint64_t rm_bigcols;            /* Number of oversized columns */
91         uint64_t rm_asize;              /* Actual total I/O size */
92         uint64_t rm_missingdata;        /* Count of missing data devices */
93         uint64_t rm_missingparity;      /* Count of missing parity devices */
94         uint64_t rm_firstdatacol;       /* First data column/parity count */
95         raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
96 } raidz_map_t;
97
98 #define VDEV_RAIDZ_P            0
99 #define VDEV_RAIDZ_Q            1
100
101 #define VDEV_RAIDZ_MAXPARITY    2
102
103 #define VDEV_RAIDZ_MUL_2(a)     (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
104
105 /*
106  * These two tables represent powers and logs of 2 in the Galois field defined
107  * above. These values were computed by repeatedly multiplying by 2 as above.
108  */
109 static const uint8_t vdev_raidz_pow2[256] = {
110         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
111         0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
112         0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
113         0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
114         0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
115         0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
116         0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
117         0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
118         0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
119         0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
120         0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
121         0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
122         0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
123         0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
124         0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
125         0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
126         0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
127         0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
128         0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
129         0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
130         0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
131         0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
132         0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
133         0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
134         0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
135         0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
136         0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
137         0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
138         0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
139         0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
140         0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
141         0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
142 };
143 static const uint8_t vdev_raidz_log2[256] = {
144         0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
145         0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
146         0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
147         0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
148         0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
149         0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
150         0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
151         0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
152         0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
153         0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
154         0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
155         0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
156         0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
157         0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
158         0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
159         0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
160         0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
161         0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
162         0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
163         0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
164         0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
165         0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
166         0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
167         0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
168         0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
169         0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
170         0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
171         0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
172         0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
173         0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
174         0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
175         0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
176 };
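
/*
 * Illustrative sketch (assumed scratch arrays, not the const tables above):
 * the tables can be regenerated by repeatedly multiplying by 2, e.g.
 *
 *	uint8_t pow2[256], log2[256] = { 0 }, p = 1;
 *	int i;
 *
 *	for (i = 0; i < 255; i++) {
 *		pow2[i] = p;
 *		log2[p] = i;
 *		p = VDEV_RAIDZ_MUL_2(p);
 *	}
 *	pow2[255] = pow2[0];	(2^255 == 1, so the last entry wraps back to 1)
 *
 * log2[0] is left at 0 as a sentinel; 0 has no logarithm in the field.
 */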
177
178 /*
179  * Multiply a given number by 2 raised to the given power.
180  */
181 static uint8_t
182 vdev_raidz_exp2(uint_t a, int exp)
183 {
184         if (a == 0)
185                 return (0);
186
187         ASSERT(exp >= 0);
188         ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
189
190         exp += vdev_raidz_log2[a];
191         if (exp > 255)
192                 exp -= 255;
193
194         return (vdev_raidz_pow2[exp]);
195 }
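
/*
 * For reference, general multiplication in this field can be built from the
 * same tables: for nonzero A and B, A * B == 2^(log_2(A) + log_2(B)).  A
 * hypothetical helper (illustrative only, not used below) would be:
 *
 *	static uint8_t
 *	gf_mul(uint8_t a, uint8_t b)
 *	{
 *		if (a == 0 || b == 0)
 *			return (0);
 *		return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
 *	}
 *
 * and the inverse of a nonzero A (A^254, per the comment at the top of this
 * file) is vdev_raidz_exp2(1, 255 - vdev_raidz_log2[A]).
 */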
196
197 static raidz_map_t *
198 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
199     uint64_t nparity)
200 {
201         raidz_map_t *rm;
202         uint64_t b = zio->io_offset >> unit_shift;
203         uint64_t s = zio->io_size >> unit_shift;
204         uint64_t f = b % dcols;
205         uint64_t o = (b / dcols) << unit_shift;
206         uint64_t q, r, c, bc, col, acols, coff, devidx;
207
208         q = s / (dcols - nparity);
209         r = s - q * (dcols - nparity);
210         bc = (r == 0 ? 0 : r + nparity);
211
212         acols = (q == 0 ? bc : dcols);
213
214         rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
215
216         rm->rm_cols = acols;
217         rm->rm_bigcols = bc;
218         rm->rm_asize = 0;
219         rm->rm_missingdata = 0;
220         rm->rm_missingparity = 0;
221         rm->rm_firstdatacol = nparity;
222
223         for (c = 0; c < acols; c++) {
224                 col = f + c;
225                 coff = o;
226                 if (col >= dcols) {
227                         col -= dcols;
228                         coff += 1ULL << unit_shift;
229                 }
230                 rm->rm_col[c].rc_devidx = col;
231                 rm->rm_col[c].rc_offset = coff;
232                 rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
233                 rm->rm_col[c].rc_data = NULL;
234                 rm->rm_col[c].rc_error = 0;
235                 rm->rm_col[c].rc_tried = 0;
236                 rm->rm_col[c].rc_skipped = 0;
237                 rm->rm_asize += rm->rm_col[c].rc_size;
238         }
239
240         rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
241
242         for (c = 0; c < rm->rm_firstdatacol; c++)
243                 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
244
245         rm->rm_col[c].rc_data = zio->io_data;
246
247         for (c = c + 1; c < acols; c++)
248                 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
249                     rm->rm_col[c - 1].rc_size;
250
251         /*
252          * If all data stored spans all columns, there's a danger that parity
253          * will always be on the same device and, since parity isn't read
254          * during normal operation, that that device's I/O bandwidth won't be
255          * used effectively. We therefore switch the parity every 1MB.
256          *
257          * ... at least that was, ostensibly, the theory. As a practical
258          * matter unless we juggle the parity between all devices evenly, we
259          * won't see any benefit. Further, occasional writes that aren't a
260          * multiple of the LCM of the number of children and the minimum
261          * stripe width are sufficient to avoid pessimal behavior.
262          * Unfortunately, this decision created an implicit on-disk format
263          * requirement that we need to support for all eternity, but only
264          * for single-parity RAID-Z.
265          */
266         ASSERT(rm->rm_cols >= 2);
267         ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
268
269         if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
270                 devidx = rm->rm_col[0].rc_devidx;
271                 o = rm->rm_col[0].rc_offset;
272                 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
273                 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
274                 rm->rm_col[1].rc_devidx = devidx;
275                 rm->rm_col[1].rc_offset = o;
276         }
277
278         zio->io_vsd = rm;
279         return (rm);
280 }
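
/*
 * Worked example (hypothetical numbers): a 5-wide raidz1 (dcols = 5,
 * nparity = 1) with 512-byte sectors (unit_shift = 9) mapping a 5K I/O
 * (s = 10 sectors).  Then q = 10 / 4 = 2, r = 2, bc = r + nparity = 3, and
 * acols = 5: columns 0-2 are "big" columns of 3 sectors, columns 3-4 hold
 * 2 sectors each.  Column 0 is the parity column (rm_firstdatacol = 1) and
 * the data spans columns 1-4.  rm_asize is the 13 sectors of column space
 * rounded up to a multiple of (nparity + 1) = 2 sectors, i.e. 14 sectors.
 */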
281
282 static void
283 vdev_raidz_map_free(zio_t *zio)
284 {
285         raidz_map_t *rm = zio->io_vsd;
286         int c;
287
288         for (c = 0; c < rm->rm_firstdatacol; c++)
289                 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
290
291         kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
292         zio->io_vsd = NULL;
293 }
294
295 static void
296 vdev_raidz_generate_parity_p(raidz_map_t *rm)
297 {
298         uint64_t *p, *src, pcount, ccount, i;
299         int c;
300
301         pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
302
303         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
304                 src = rm->rm_col[c].rc_data;
305                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
306                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
307
308                 if (c == rm->rm_firstdatacol) {
309                         ASSERT(ccount == pcount);
310                         for (i = 0; i < ccount; i++, p++, src++) {
311                                 *p = *src;
312                         }
313                 } else {
314                         ASSERT(ccount <= pcount);
315                         for (i = 0; i < ccount; i++, p++, src++) {
316                                 *p ^= *src;
317                         }
318                 }
319         }
320 }
321
322 static void
323 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
324 {
325         uint64_t *q, *p, *src, pcount, ccount, mask, i;
326         int c;
327
328         pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
329         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
330             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
331
332         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
333                 src = rm->rm_col[c].rc_data;
334                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
335                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
336                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
337
338                 if (c == rm->rm_firstdatacol) {
339                         ASSERT(ccount == pcount || ccount == 0);
340                         for (i = 0; i < ccount; i++, p++, q++, src++) {
341                                 *q = *src;
342                                 *p = *src;
343                         }
344                         for (; i < pcount; i++, p++, q++, src++) {
345                                 *q = 0;
346                                 *p = 0;
347                         }
348                 } else {
349                         ASSERT(ccount <= pcount);
350
351                         /*
352                          * Rather than multiplying each byte individually (as
353                          * described above), we are able to handle 8 at once
354                          * by generating a mask based on the high bit in each
355                          * byte and using that to conditionally XOR in 0x1d.
356                          */
357                         for (i = 0; i < ccount; i++, p++, q++, src++) {
358                                 mask = *q & 0x8080808080808080ULL;
359                                 mask = (mask << 1) - (mask >> 7);
360                                 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
361                                     (mask & 0x1d1d1d1d1d1d1d1dULL);
362                                 *q ^= *src;
363                                 *p ^= *src;
364                         }
365
366                         /*
367                          * Treat short columns as though they are full of 0s.
368                          */
369                         for (; i < pcount; i++, q++) {
370                                 mask = *q & 0x8080808080808080ULL;
371                                 mask = (mask << 1) - (mask >> 7);
372                                 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
373                                     (mask & 0x1d1d1d1d1d1d1d1dULL);
374                         }
375                 }
376         }
377 }
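
/*
 * For clarity, the masked expression above applies VDEV_RAIDZ_MUL_2() to all
 * eight bytes of the 64-bit word at once.  An equivalent (but slower)
 * byte-at-a-time form of the Q update would be:
 *
 *	uint8_t *qb = (uint8_t *)q;
 *	int j;
 *
 *	for (j = 0; j < 8; j++)
 *		qb[j] = VDEV_RAIDZ_MUL_2(qb[j]);
 *	*q ^= *src;
 */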
378
379 static void
380 vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
381 {
382         uint64_t *dst, *src, xcount, ccount, count, i;
383         int c;
384
385         xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
386         ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
387         ASSERT(xcount > 0);
388
389         src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
390         dst = rm->rm_col[x].rc_data;
391         for (i = 0; i < xcount; i++, dst++, src++) {
392                 *dst = *src;
393         }
394
395         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
396                 src = rm->rm_col[c].rc_data;
397                 dst = rm->rm_col[x].rc_data;
398
399                 if (c == x)
400                         continue;
401
402                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
403                 count = MIN(ccount, xcount);
404
405                 for (i = 0; i < count; i++, dst++, src++) {
406                         *dst ^= *src;
407                 }
408         }
409 }
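
/*
 * Since P == D_0 + ... + D_n-1 and '+' is XOR, the missing column is simply
 *
 *	D_x == P + (sum of the surviving data columns)
 *
 * which is what the two loops above compute: copy P into column x, then XOR
 * in every other data column.
 */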
410
411 static void
412 vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
413 {
414         uint64_t *dst, *src, xcount, ccount, count, mask, i;
415         uint8_t *b;
416         int c, j, exp;
417
418         xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
419         ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
420
421         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
422                 src = rm->rm_col[c].rc_data;
423                 dst = rm->rm_col[x].rc_data;
424
425                 if (c == x)
426                         ccount = 0;
427                 else
428                         ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
429
430                 count = MIN(ccount, xcount);
431
432                 if (c == rm->rm_firstdatacol) {
433                         for (i = 0; i < count; i++, dst++, src++) {
434                                 *dst = *src;
435                         }
436                         for (; i < xcount; i++, dst++) {
437                                 *dst = 0;
438                         }
439
440                 } else {
441                         /*
442                          * For an explanation of this, see the comment in
443                          * vdev_raidz_generate_parity_pq() above.
444                          */
445                         for (i = 0; i < count; i++, dst++, src++) {
446                                 mask = *dst & 0x8080808080808080ULL;
447                                 mask = (mask << 1) - (mask >> 7);
448                                 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
449                                     (mask & 0x1d1d1d1d1d1d1d1dULL);
450                                 *dst ^= *src;
451                         }
452
453                         for (; i < xcount; i++, dst++) {
454                                 mask = *dst & 0x8080808080808080ULL;
455                                 mask = (mask << 1) - (mask >> 7);
456                                 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
457                                     (mask & 0x1d1d1d1d1d1d1d1dULL);
458                         }
459                 }
460         }
461
462         src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
463         dst = rm->rm_col[x].rc_data;
464         exp = 255 - (rm->rm_cols - 1 - x);
465
466         for (i = 0; i < xcount; i++, dst++, src++) {
467                 *dst ^= *src;
468                 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
469                         *b = vdev_raidz_exp2(*b, exp);
470                 }
471         }
472 }
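
/*
 * A note on the final loop above: after the first pass over the data columns,
 * column x holds Q', which is Q computed as though column x were zero, so
 *
 *	Q + Q' == 2^(rm_cols - 1 - x) * D_x
 *
 * Multiplying by 2^(255 - (rm_cols - 1 - x)) -- the inverse, since 2^255 == 1
 * in this field -- recovers D_x; that is exactly the exp used above.
 */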
473
474 static void
475 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
476 {
477         uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
478         void *pdata, *qdata;
479         uint64_t xsize, ysize, i;
480
481         ASSERT(x < y);
482         ASSERT(x >= rm->rm_firstdatacol);
483         ASSERT(y < rm->rm_cols);
484
485         ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
486
487         /*
488          * Move the parity data aside -- we're going to compute parity as
489          * though columns x and y were full of zeros -- Pxy and Qxy. We want to
490          * reuse the parity generation mechanism without trashing the actual
491          * parity so we make those columns appear to be full of zeros by
492          * setting their lengths to zero.
493          */
494         pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
495         qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
496         xsize = rm->rm_col[x].rc_size;
497         ysize = rm->rm_col[y].rc_size;
498
499         rm->rm_col[VDEV_RAIDZ_P].rc_data =
500             zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
501         rm->rm_col[VDEV_RAIDZ_Q].rc_data =
502             zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
503         rm->rm_col[x].rc_size = 0;
504         rm->rm_col[y].rc_size = 0;
505
506         vdev_raidz_generate_parity_pq(rm);
507
508         rm->rm_col[x].rc_size = xsize;
509         rm->rm_col[y].rc_size = ysize;
510
511         p = pdata;
512         q = qdata;
513         pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
514         qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
515         xd = rm->rm_col[x].rc_data;
516         yd = rm->rm_col[y].rc_data;
517
518         /*
519          * We now have:
520          *      Pxy = P + D_x + D_y
521          *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
522          *
523          * We can then solve for D_x:
524          *      D_x = A * (P + Pxy) + B * (Q + Qxy)
525          * where
526          *      A = 2^(x - y) * (2^(x - y) + 1)^-1
527  *      B = 2^(255 - (ndevs - 1 - x)) * (2^(x - y) + 1)^-1
528          *
529          * With D_x in hand, we can easily solve for D_y:
530          *      D_y = P + Pxy + D_x
531          */
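
	/*
	 * A sketch of where A and B come from: adding 2^(ndevs - 1 - y) times
	 * the first equation to the second cancels the D_y terms, leaving
	 *
	 *	(Q + Qxy) + 2^(ndevs - 1 - y) * (P + Pxy) =
	 *	    (2^(ndevs - 1 - x) + 2^(ndevs - 1 - y)) * D_x
	 *
	 * Factoring the parenthesized sum as 2^(ndevs - 1 - x) * (2^(x - y) + 1)
	 * and dividing through gives the A and B above, with 2^(255 - e)
	 * standing in for the inverse of 2^e since 2^255 == 1 in this field.
	 */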
532
533         a = vdev_raidz_pow2[255 + x - y];
534         b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
535         tmp = 255 - vdev_raidz_log2[a ^ 1];
536
537         aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
538         bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
539
540         for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
541                 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
542                     vdev_raidz_exp2(*q ^ *qxy, bexp);
543
544                 if (i < ysize)
545                         *yd = *p ^ *pxy ^ *xd;
546         }
547
548         zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
549             rm->rm_col[VDEV_RAIDZ_P].rc_size);
550         zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
551             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
552
553         /*
554          * Restore the saved parity data.
555          */
556         rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
557         rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
558 }
559
560
561 static int
562 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
563 {
564         vdev_t *cvd;
565         uint64_t nparity = vd->vdev_nparity;
566         int c, error;
567         int lasterror = 0;
568         int numerrors = 0;
569
570         ASSERT(nparity > 0);
571
572         if (nparity > VDEV_RAIDZ_MAXPARITY ||
573             vd->vdev_children < nparity + 1) {
574                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
575                 return (EINVAL);
576         }
577
578         for (c = 0; c < vd->vdev_children; c++) {
579                 cvd = vd->vdev_child[c];
580
581                 if ((error = vdev_open(cvd)) != 0) {
582                         lasterror = error;
583                         numerrors++;
584                         continue;
585                 }
586
587                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
588                 *ashift = MAX(*ashift, cvd->vdev_ashift);
589         }
590
591         *asize *= vd->vdev_children;
592
593         if (numerrors > nparity) {
594                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
595                 return (lasterror);
596         }
597
598         return (0);
599 }
600
601 static void
602 vdev_raidz_close(vdev_t *vd)
603 {
604         int c;
605
606         for (c = 0; c < vd->vdev_children; c++)
607                 vdev_close(vd->vdev_child[c]);
608 }
609
610 static uint64_t
611 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
612 {
613         uint64_t asize;
614         uint64_t ashift = vd->vdev_top->vdev_ashift;
615         uint64_t cols = vd->vdev_children;
616         uint64_t nparity = vd->vdev_nparity;
617
618         asize = ((psize - 1) >> ashift) + 1;
619         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
620         asize = roundup(asize, nparity + 1) << ashift;
621
622         return (asize);
623 }
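
/*
 * Worked example (hypothetical numbers): for a 5-wide raidz1 (cols = 5,
 * nparity = 1) with ashift = 9, a 16K block occupies 32 data sectors; 8
 * parity sectors are added (one per 4 data sectors), and 40 is already a
 * multiple of (nparity + 1) = 2, so the allocated size is 40 sectors (20K).
 */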
624
625 static void
626 vdev_raidz_child_done(zio_t *zio)
627 {
628         raidz_col_t *rc = zio->io_private;
629
630         rc->rc_error = zio->io_error;
631         rc->rc_tried = 1;
632         rc->rc_skipped = 0;
633 }
634
635 static void
636 vdev_raidz_repair_done(zio_t *zio)
637 {
638         ASSERT(zio->io_private == zio->io_parent);
639         vdev_raidz_map_free(zio->io_private);
640 }
641
642 static void
643 vdev_raidz_io_start(zio_t *zio)
644 {
645         vdev_t *vd = zio->io_vd;
646         vdev_t *tvd = vd->vdev_top;
647         vdev_t *cvd;
648         blkptr_t *bp = zio->io_bp;
649         raidz_map_t *rm;
650         raidz_col_t *rc;
651         int c;
652
653         rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
654             vd->vdev_nparity);
655
656         ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
657
658         if (zio->io_type == ZIO_TYPE_WRITE) {
659                 /*
660                  * Generate RAID parity in the first virtual columns.
661                  */
662                 if (rm->rm_firstdatacol == 1)
663                         vdev_raidz_generate_parity_p(rm);
664                 else
665                         vdev_raidz_generate_parity_pq(rm);
666
667                 for (c = 0; c < rm->rm_cols; c++) {
668                         rc = &rm->rm_col[c];
669                         cvd = vd->vdev_child[rc->rc_devidx];
670                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
671                             rc->rc_offset, rc->rc_data, rc->rc_size,
672                             zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
673                             vdev_raidz_child_done, rc));
674                 }
675                 zio_wait_children_done(zio);
676                 return;
677         }
678
679         ASSERT(zio->io_type == ZIO_TYPE_READ);
680
681         /*
682          * Iterate over the columns in reverse order so that we hit the parity
683          * last -- any errors along the way will force us to read the parity
684          * data.
685          */
686         for (c = rm->rm_cols - 1; c >= 0; c--) {
687                 rc = &rm->rm_col[c];
688                 cvd = vd->vdev_child[rc->rc_devidx];
689                 if (vdev_is_dead(cvd)) {
690                         if (c >= rm->rm_firstdatacol)
691                                 rm->rm_missingdata++;
692                         else
693                                 rm->rm_missingparity++;
694                         rc->rc_error = ENXIO;
695                         rc->rc_tried = 1;       /* don't even try */
696                         rc->rc_skipped = 1;
697                         continue;
698                 }
699                 if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
700                         if (c >= rm->rm_firstdatacol)
701                                 rm->rm_missingdata++;
702                         else
703                                 rm->rm_missingparity++;
704                         rc->rc_error = ESTALE;
705                         rc->rc_skipped = 1;
706                         continue;
707                 }
708                 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
709                     (zio->io_flags & ZIO_FLAG_SCRUB)) {
710                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
711                             rc->rc_offset, rc->rc_data, rc->rc_size,
712                             zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
713                             vdev_raidz_child_done, rc));
714                 }
715         }
716
717         zio_wait_children_done(zio);
718 }
719
720 /*
721  * Report a checksum error for a child of a RAID-Z device.
722  */
723 static void
724 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
725 {
726         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
727         dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
728             vdev_description(vd));
729
730         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
731                 mutex_enter(&vd->vdev_stat_lock);
732                 vd->vdev_stat.vs_checksum_errors++;
733                 mutex_exit(&vd->vdev_stat_lock);
734         }
735
736         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
737                 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
738                     zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
739 }
740
741 /*
742  * Generate the parity from the data columns. If we tried and were able to
743  * read the parity without error, verify that the generated parity matches the
744  * data we read. If it doesn't, we fire off a checksum error. Return the
745  * number of such failures.
746  */
747 static int
748 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
749 {
750         void *orig[VDEV_RAIDZ_MAXPARITY];
751         int c, ret = 0;
752         raidz_col_t *rc;
753
754         for (c = 0; c < rm->rm_firstdatacol; c++) {
755                 rc = &rm->rm_col[c];
756                 if (!rc->rc_tried || rc->rc_error != 0)
757                         continue;
758                 orig[c] = zio_buf_alloc(rc->rc_size);
759                 bcopy(rc->rc_data, orig[c], rc->rc_size);
760         }
761
762         if (rm->rm_firstdatacol == 1)
763                 vdev_raidz_generate_parity_p(rm);
764         else
765                 vdev_raidz_generate_parity_pq(rm);
766
767         for (c = 0; c < rm->rm_firstdatacol; c++) {
768                 rc = &rm->rm_col[c];
769                 if (!rc->rc_tried || rc->rc_error != 0)
770                         continue;
771                 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
772                         raidz_checksum_error(zio, rc);
773                         rc->rc_error = ECKSUM;
774                         ret++;
775                 }
776                 zio_buf_free(orig[c], rc->rc_size);
777         }
778
779         return (ret);
780 }
781
782 static uint64_t raidz_corrected_p;
783 static uint64_t raidz_corrected_q;
784 static uint64_t raidz_corrected_pq;
785
786 static void
787 vdev_raidz_io_done(zio_t *zio)
788 {
789         vdev_t *vd = zio->io_vd;
790         vdev_t *cvd;
791         raidz_map_t *rm = zio->io_vsd;
792         raidz_col_t *rc, *rc1;
793         int unexpected_errors = 0;
794         int parity_errors = 0;
795         int parity_untried = 0;
796         int data_errors = 0;
797         int n, c, c1;
798
799         ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
800
801         zio->io_error = 0;
802         zio->io_numerrors = 0;
803
804         ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
805         ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
806
807         for (c = 0; c < rm->rm_cols; c++) {
808                 rc = &rm->rm_col[c];
809
810                 /*
811                  * We preserve any EIOs because those may be worth retrying;
812                  * whereas ECKSUM and ENXIO are more likely to be persistent.
813                  */
814                 if (rc->rc_error) {
815                         if (zio->io_error != EIO)
816                                 zio->io_error = rc->rc_error;
817
818                         if (c < rm->rm_firstdatacol)
819                                 parity_errors++;
820                         else
821                                 data_errors++;
822
823                         if (!rc->rc_skipped)
824                                 unexpected_errors++;
825
826                         zio->io_numerrors++;
827                 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
828                         parity_untried++;
829                 }
830         }
831
832         if (zio->io_type == ZIO_TYPE_WRITE) {
833                 /*
834                  * If this is not a failfast write, and we were able to
835                  * write enough columns to reconstruct the data, good enough.
836                  */
837                 /* XXPOLICY */
838                 if (zio->io_numerrors <= rm->rm_firstdatacol &&
839                     !(zio->io_flags & ZIO_FLAG_FAILFAST))
840                         zio->io_error = 0;
841
842                 vdev_raidz_map_free(zio);
843                 zio_next_stage(zio);
844                 return;
845         }
846
847         ASSERT(zio->io_type == ZIO_TYPE_READ);
848         /*
849          * There are three potential phases for a read:
850          *      1. produce valid data from the columns read
851          *      2. read all disks and try again
852          *      3. perform combinatorial reconstruction
853          *
854          * Each phase is progressively both more expensive and less likely to
855          * occur. If we encounter more errors than we can repair or all phases
856          * fail, we have no choice but to return an error.
857          */
858
859         /*
860          * If the number of errors we saw was correctable -- less than or equal
861          * to the number of parity disks read -- attempt to produce data that
862          * has a valid checksum. Naturally, this case applies in the absence of
863          * any errors.
864          */
865         if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
866                 switch (data_errors) {
867                 case 0:
868                         if (zio_checksum_error(zio) == 0) {
869                                 zio->io_error = 0;
870                                 if (parity_errors + parity_untried <
871                                     rm->rm_firstdatacol) {
872                                         n = raidz_parity_verify(zio, rm);
873                                         unexpected_errors += n;
874                                         ASSERT(parity_errors + n <=
875                                             rm->rm_firstdatacol);
876                                 }
877                                 goto done;
878                         }
879                         break;
880
881                 case 1:
882                         /*
883                          * We either attempt to read all the parity columns or
884                          * none of them. If we didn't try to read parity, we
885                          * wouldn't be here in the correctable case. There must
886                          * also have been fewer parity errors than parity
887                          * columns or, again, we wouldn't be in this code path.
888                          */
889                         ASSERT(parity_untried == 0);
890                         ASSERT(parity_errors < rm->rm_firstdatacol);
891
892                         /*
893                          * Find the column that reported the error.
894                          */
895                         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
896                                 rc = &rm->rm_col[c];
897                                 if (rc->rc_error != 0)
898                                         break;
899                         }
900                         ASSERT(c != rm->rm_cols);
901                         ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
902                             rc->rc_error == ESTALE);
903
904                         if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
905                                 vdev_raidz_reconstruct_p(rm, c);
906                         } else {
907                                 ASSERT(rm->rm_firstdatacol > 1);
908                                 vdev_raidz_reconstruct_q(rm, c);
909                         }
910
911                         if (zio_checksum_error(zio) == 0) {
912                                 zio->io_error = 0;
913                                 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
914                                         atomic_inc_64(&raidz_corrected_p);
915                                 else
916                                         atomic_inc_64(&raidz_corrected_q);
917
918                                 /*
919                                  * If there's more than one parity disk that
920                                  * was successfully read, confirm that the
921                                  * other parity disk produced the correct data.
922                                  * This routine is suboptimal in that it
923                                  * regenerates both the parity we wish to test
924                                  * as well as the parity we just used to
925                                  * perform the reconstruction, but this should
926                                  * be a relatively uncommon case, and can be
927                                  * optimized if it becomes a problem.
928                                  */
929                                 if (parity_errors < rm->rm_firstdatacol - 1) {
930                                         n = raidz_parity_verify(zio, rm);
931                                         unexpected_errors += n;
932                                         ASSERT(parity_errors + n <=
933                                             rm->rm_firstdatacol);
934                                 }
935
936                                 goto done;
937                         }
938                         break;
939
940                 case 2:
941                         /*
942                          * Two data column errors require double parity.
943                          */
944                         ASSERT(rm->rm_firstdatacol == 2);
945
946                         /*
947                          * Find the two columns that reported errors.
948                          */
949                         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
950                                 rc = &rm->rm_col[c];
951                                 if (rc->rc_error != 0)
952                                         break;
953                         }
954                         ASSERT(c != rm->rm_cols);
955                         ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
956                             rc->rc_error == ESTALE);
957
958                         for (c1 = c++; c < rm->rm_cols; c++) {
959                                 rc = &rm->rm_col[c];
960                                 if (rc->rc_error != 0)
961                                         break;
962                         }
963                         ASSERT(c != rm->rm_cols);
964                         ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
965                             rc->rc_error == ESTALE);
966
967                         vdev_raidz_reconstruct_pq(rm, c1, c);
968
969                         if (zio_checksum_error(zio) == 0) {
970                                 zio->io_error = 0;
971                                 atomic_inc_64(&raidz_corrected_pq);
972
973                                 goto done;
974                         }
975                         break;
976
977                 default:
978                         ASSERT(rm->rm_firstdatacol <= 2);
979                         ASSERT(0);
980                 }
981         }
982
983         /*
984          * This isn't a typical situation -- either we got a read error or
985          * a child silently returned bad data. Read every block so we can
986          * try again with as much data and parity as we can track down. If
987          * we've already been through once before, all children will be marked
988          * as tried so we'll proceed to combinatorial reconstruction.
989          */
990         unexpected_errors = 1;
991         rm->rm_missingdata = 0;
992         rm->rm_missingparity = 0;
993
994         for (c = 0; c < rm->rm_cols; c++) {
995                 if (rm->rm_col[c].rc_tried)
996                         continue;
997
998                 zio->io_error = 0;
999                 zio_vdev_io_redone(zio);
1000                 do {
1001                         rc = &rm->rm_col[c];
1002                         if (rc->rc_tried)
1003                                 continue;
1004                         zio_nowait(zio_vdev_child_io(zio, NULL,
1005                             vd->vdev_child[rc->rc_devidx],
1006                             rc->rc_offset, rc->rc_data, rc->rc_size,
1007                             zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
1008                             vdev_raidz_child_done, rc));
1009                 } while (++c < rm->rm_cols);
1010                 dprintf("rereading\n");
1011                 zio_wait_children_done(zio);
1012                 return;
1013         }
1014
1015         /*
1016          * At this point we've attempted to reconstruct the data given the
1017          * errors we detected, and we've attempted to read all columns. There
1018          * must, therefore, be one or more additional problems -- silent errors
1019          * resulting in invalid data rather than explicit I/O errors resulting
1020          * in absent data. Before we attempt combinatorial reconstruction make
1021          * sure we have a chance of coming up with the right answer.
1022          */
1023         if (zio->io_numerrors >= rm->rm_firstdatacol) {
1024                 ASSERT(zio->io_error != 0);
1025                 goto done;
1026         }
1027
1028         if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
1029                 /*
1030                  * Attempt to reconstruct the data from parity P.
1031                  */
1032                 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1033                         void *orig;
1034                         rc = &rm->rm_col[c];
1035
1036                         orig = zio_buf_alloc(rc->rc_size);
1037                         bcopy(rc->rc_data, orig, rc->rc_size);
1038                         vdev_raidz_reconstruct_p(rm, c);
1039
1040                         if (zio_checksum_error(zio) == 0) {
1041                                 zio_buf_free(orig, rc->rc_size);
1042                                 zio->io_error = 0;
1043                                 atomic_inc_64(&raidz_corrected_p);
1044
1045                                 /*
1046                                  * If this child didn't know that it returned
1047                                  * bad data, inform it.
1048                                  */
1049                                 if (rc->rc_tried && rc->rc_error == 0)
1050                                         raidz_checksum_error(zio, rc);
1051                                 rc->rc_error = ECKSUM;
1052                                 goto done;
1053                         }
1054
1055                         bcopy(orig, rc->rc_data, rc->rc_size);
1056                         zio_buf_free(orig, rc->rc_size);
1057                 }
1058         }
1059
1060         if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1061                 /*
1062                  * Attempt to reconstruct the data from parity Q.
1063                  */
1064                 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1065                         void *orig;
1066                         rc = &rm->rm_col[c];
1067
1068                         orig = zio_buf_alloc(rc->rc_size);
1069                         bcopy(rc->rc_data, orig, rc->rc_size);
1070                         vdev_raidz_reconstruct_q(rm, c);
1071
1072                         if (zio_checksum_error(zio) == 0) {
1073                                 zio_buf_free(orig, rc->rc_size);
1074                                 zio->io_error = 0;
1075                                 atomic_inc_64(&raidz_corrected_q);
1076
1077                                 /*
1078                                  * If this child didn't know that it returned
1079                                  * bad data, inform it.
1080                                  */
1081                                 if (rc->rc_tried && rc->rc_error == 0)
1082                                         raidz_checksum_error(zio, rc);
1083                                 rc->rc_error = ECKSUM;
1084                                 goto done;
1085                         }
1086
1087                         bcopy(orig, rc->rc_data, rc->rc_size);
1088                         zio_buf_free(orig, rc->rc_size);
1089                 }
1090         }
1091
1092         if (rm->rm_firstdatacol > 1 &&
1093             rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
1094             rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1095                 /*
1096                  * Attempt to reconstruct the data from both P and Q.
1097                  */
1098                 for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
1099                         void *orig, *orig1;
1100                         rc = &rm->rm_col[c];
1101
1102                         orig = zio_buf_alloc(rc->rc_size);
1103                         bcopy(rc->rc_data, orig, rc->rc_size);
1104
1105                         for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
1106                                 rc1 = &rm->rm_col[c1];
1107
1108                                 orig1 = zio_buf_alloc(rc1->rc_size);
1109                                 bcopy(rc1->rc_data, orig1, rc1->rc_size);
1110
1111                                 vdev_raidz_reconstruct_pq(rm, c, c1);
1112
1113                                 if (zio_checksum_error(zio) == 0) {
1114                                         zio_buf_free(orig, rc->rc_size);
1115                                         zio_buf_free(orig1, rc1->rc_size);
1116                                         zio->io_error = 0;
1117                                         atomic_inc_64(&raidz_corrected_pq);
1118
1119                                         /*
1120                                          * If these children didn't know they
1121                                          * returned bad data, inform them.
1122                                          */
1123                                         if (rc->rc_tried && rc->rc_error == 0)
1124                                                 raidz_checksum_error(zio, rc);
1125                                         if (rc1->rc_tried && rc1->rc_error == 0)
1126                                                 raidz_checksum_error(zio, rc1);
1127
1128                                         rc->rc_error = ECKSUM;
1129                                         rc1->rc_error = ECKSUM;
1130
1131                                         goto done;
1132                                 }
1133
1134                                 bcopy(orig1, rc1->rc_data, rc1->rc_size);
1135                                 zio_buf_free(orig1, rc1->rc_size);
1136                         }
1137
1138                         bcopy(orig, rc->rc_data, rc->rc_size);
1139                         zio_buf_free(orig, rc->rc_size);
1140                 }
1141         }
1142
1143         /*
1144          * All combinations failed to checksum. Generate checksum ereports for
1145          * all children.
1146          */
1147         zio->io_error = ECKSUM;
1148         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1149                 for (c = 0; c < rm->rm_cols; c++) {
1150                         rc = &rm->rm_col[c];
1151                         zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1152                             zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
1153                             rc->rc_offset, rc->rc_size);
1154                 }
1155         }
1156
1157 done:
1158         zio_checksum_verified(zio);
1159
1160         if (zio->io_error == 0 && (spa_mode & FWRITE) &&
1161             (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1162                 zio_t *rio;
1163
1164                 /*
1165                  * Use the good data we have in hand to repair damaged children.
1166                  *
1167                  * We issue all repair I/Os as children of 'rio' to arrange
1168                  * that vdev_raidz_map_free(zio) will be invoked after all
1169                  * repairs complete, but before we advance to the next stage.
1170                  */
1171                 rio = zio_null(zio, zio->io_spa,
1172                     vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
1173
1174                 for (c = 0; c < rm->rm_cols; c++) {
1175                         rc = &rm->rm_col[c];
1176                         cvd = vd->vdev_child[rc->rc_devidx];
1177
1178                         if (rc->rc_error == 0)
1179                                 continue;
1180
1181                         dprintf("%s resilvered %s @ 0x%llx error %d\n",
1182                             vdev_description(vd),
1183                             vdev_description(cvd),
1184                             zio->io_offset, rc->rc_error);
1185
1186                         zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
1187                             rc->rc_offset, rc->rc_data, rc->rc_size,
1188                             ZIO_TYPE_WRITE, zio->io_priority,
1189                             ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
1190                             ZIO_FLAG_CANFAIL, NULL, NULL));
1191                 }
1192
1193                 zio_nowait(rio);
1194                 zio_wait_children_done(zio);
1195                 return;
1196         }
1197
1198         vdev_raidz_map_free(zio);
1199         zio_next_stage(zio);
1200 }
1201
1202 static void
1203 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1204 {
1205         if (faulted > vd->vdev_nparity)
1206                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1207                     VDEV_AUX_NO_REPLICAS);
1208         else if (degraded + faulted != 0)
1209                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1210         else
1211                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1212 }
1213
1214 vdev_ops_t vdev_raidz_ops = {
1215         vdev_raidz_open,
1216         vdev_raidz_close,
1217         vdev_raidz_asize,
1218         vdev_raidz_io_start,
1219         vdev_raidz_io_done,
1220         vdev_raidz_state_change,
1221         VDEV_TYPE_RAIDZ,        /* name of this vdev type */
1222         B_FALSE                 /* not a leaf vdev */
1223 };