1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/vdev_impl.h>
30 #include <sys/zio.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
34
35 /*
36  * Virtual device vector for RAID-Z.
37  *
38  * This vdev supports both single and double parity. For single parity, we
39  * use a simple XOR of all the data columns. For double parity, we use both
40  * the simple XOR as well as a technique described in "The mathematics of
41  * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
42  * over the integers expressible in a single byte. Briefly, the operations on
43  * the field are defined as follows:
44  *
45  *   o addition (+) is represented by a bitwise XOR
46  *   o subtraction (-) is therefore identical to addition: A + B = A - B
47  *   o multiplication of A by 2 is defined by the following bitwise expression:
48  *      (A * 2)_7 = A_6
49  *      (A * 2)_6 = A_5
50  *      (A * 2)_5 = A_4
51  *      (A * 2)_4 = A_3 + A_7
52  *      (A * 2)_3 = A_2 + A_7
53  *      (A * 2)_2 = A_1 + A_7
54  *      (A * 2)_1 = A_0
55  *      (A * 2)_0 = A_7
56  *
57  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
58  *
59  * Observe that any number in the field (except for 0) can be expressed as a
60  * power of 2 -- a generator for the field. We store a table of the powers of
61  * 2 and logs base 2 for quick lookups, and exploit the fact that A * B can
62  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
63  * than field addition). The inverse of a field element A (A^-1) is A^254.
64  *
65  * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
66  * can be expressed by field operations:
67  *
68  *      P = D_0 + D_1 + ... + D_n-2 + D_n-1
69  *      Q = 2^(n-1) * D_0 + 2^(n-2) * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
70  *        = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
71  *
72  * See the reconstruction code below for how P and Q can be used individually
73  * or in concert to recover missing data columns.
74  */
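/*
 * A quick worked example of the arithmetic above: multiplying 0x80 by 2
 * shifts the high bit out (leaving 0x00) and XORs in 0x1d, so 0x80 * 2 =
 * 0x1d -- which is vdev_raidz_pow2[8] below, 0x80 being 2^7. More generally,
 * for non-zero A and B the product can be looked up as
 *
 *    A * B = vdev_raidz_pow2[(vdev_raidz_log2[A] + vdev_raidz_log2[B]) % 255]
 */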
75
76 typedef struct raidz_col {
77         uint64_t rc_devidx;             /* child device index for I/O */
78         uint64_t rc_offset;             /* device offset */
79         uint64_t rc_size;               /* I/O size */
80         void *rc_data;                  /* I/O data */
81         int rc_error;                   /* I/O error for this device */
82         uint8_t rc_tried;               /* Did we attempt this I/O column? */
83         uint8_t rc_skipped;             /* Did we skip this I/O column? */
84 } raidz_col_t;
85
86 typedef struct raidz_map {
87         uint64_t rm_cols;               /* Column count */
88         uint64_t rm_bigcols;            /* Number of oversized columns */
89         uint64_t rm_asize;              /* Actual total I/O size */
90         uint64_t rm_missingdata;        /* Count of missing data devices */
91         uint64_t rm_missingparity;      /* Count of missing parity devices */
92         uint64_t rm_firstdatacol;       /* First data column/parity count */
93         raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
94 } raidz_map_t;
95
96 #define VDEV_RAIDZ_P            0
97 #define VDEV_RAIDZ_Q            1
98
99 #define VDEV_RAIDZ_MAXPARITY    2
100
101 #define VDEV_RAIDZ_MUL_2(a)     (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
102
103 /*
104  * These two tables represent powers and logs of 2 in the Galois field defined
105  * above. These values were computed by repeatedly multiplying by 2 as above.
106  */
107 static const uint8_t vdev_raidz_pow2[256] = {
108         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
109         0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
110         0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
111         0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
112         0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
113         0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
114         0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
115         0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
116         0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
117         0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
118         0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
119         0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
120         0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
121         0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
122         0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
123         0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
124         0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
125         0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
126         0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
127         0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
128         0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
129         0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
130         0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
131         0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
132         0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
133         0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
134         0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
135         0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
136         0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
137         0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
138         0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
139         0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
140 };
141 static const uint8_t vdev_raidz_log2[256] = {
142         0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
143         0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
144         0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
145         0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
146         0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
147         0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
148         0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
149         0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
150         0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
151         0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
152         0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
153         0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
154         0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
155         0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
156         0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
157         0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
158         0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
159         0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
160         0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
161         0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
162         0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
163         0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
164         0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
165         0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
166         0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
167         0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
168         0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
169         0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
170         0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
171         0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
172         0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
173         0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
174 };
175
176 /*
177  * Multiply a given number by 2 raised to the given power.
178  */
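/*
 * For example, vdev_raidz_exp2(0x02, 3) is 0x02 * 2^3: log2[0x02] is 1, the
 * exponent becomes 4, and vdev_raidz_pow2[4] is 0x10.
 */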
179 static uint8_t
180 vdev_raidz_exp2(uint_t a, int exp)
181 {
182         if (a == 0)
183                 return (0);
184
185         ASSERT(exp >= 0);
186         ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
187
188         exp += vdev_raidz_log2[a];
189         if (exp > 255)
190                 exp -= 255;
191
192         return (vdev_raidz_pow2[exp]);
193 }
194
195 static void
196 vdev_raidz_map_free(zio_t *zio)
197 {
198         raidz_map_t *rm = zio->io_vsd;
199         int c;
200
201         for (c = 0; c < rm->rm_firstdatacol; c++)
202                 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
203
204         kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
205 }
206
207 static raidz_map_t *
208 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
209     uint64_t nparity)
210 {
211         raidz_map_t *rm;
212         uint64_t b = zio->io_offset >> unit_shift;
213         uint64_t s = zio->io_size >> unit_shift;
214         uint64_t f = b % dcols;
215         uint64_t o = (b / dcols) << unit_shift;
216         uint64_t q, r, c, bc, col, acols, coff, devidx;
217
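        /*
         * Everything below is computed in units of (1 << unit_shift) byte
         * sectors: b is the first sector of this I/O within the raidz vdev
         * and s is its length; f is the child on which the first column
         * lands and o is the byte offset on the children at which the I/O's
         * first row starts. q is the number of complete stripe rows (each
         * holding dcols - nparity data sectors), r is the number of leftover
         * data sectors, bc is how many leading columns get one extra sector
         * (the "big" columns, parity included), and acols is the total
         * number of columns this I/O touches.
         */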
218         q = s / (dcols - nparity);
219         r = s - q * (dcols - nparity);
220         bc = (r == 0 ? 0 : r + nparity);
221
222         acols = (q == 0 ? bc : dcols);
223
224         rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
225
226         rm->rm_cols = acols;
227         rm->rm_bigcols = bc;
228         rm->rm_asize = 0;
229         rm->rm_missingdata = 0;
230         rm->rm_missingparity = 0;
231         rm->rm_firstdatacol = nparity;
232
233         for (c = 0; c < acols; c++) {
234                 col = f + c;
235                 coff = o;
236                 if (col >= dcols) {
237                         col -= dcols;
238                         coff += 1ULL << unit_shift;
239                 }
240                 rm->rm_col[c].rc_devidx = col;
241                 rm->rm_col[c].rc_offset = coff;
242                 rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
243                 rm->rm_col[c].rc_data = NULL;
244                 rm->rm_col[c].rc_error = 0;
245                 rm->rm_col[c].rc_tried = 0;
246                 rm->rm_col[c].rc_skipped = 0;
247                 rm->rm_asize += rm->rm_col[c].rc_size;
248         }
249
250         rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
251
252         for (c = 0; c < rm->rm_firstdatacol; c++)
253                 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
254
255         rm->rm_col[c].rc_data = zio->io_data;
256
257         for (c = c + 1; c < acols; c++)
258                 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
259                     rm->rm_col[c - 1].rc_size;
260
261         /*
262          * If all data stored spans all columns, there's a danger that parity
263          * will always be on the same device and, since parity isn't read
264          * during normal operation, that that device's I/O bandwidth won't be
265          * used effectively. We therefore switch the parity every 1MB.
266          *
267          * ... at least that was, ostensibly, the theory. As a practical
268          * matter unless we juggle the parity between all devices evenly, we
269          * won't see any benefit. Further, occasional writes that aren't a
270          * multiple of the LCM of the number of children and the minimum
271          * stripe width are sufficient to avoid pessimal behavior.
272          * Unfortunately, this decision created an implicit on-disk format
273          * requirement that we need to support for all eternity, but only
274          * for single-parity RAID-Z.
275          */
276         ASSERT(rm->rm_cols >= 2);
277         ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
278
279         if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
280                 devidx = rm->rm_col[0].rc_devidx;
281                 o = rm->rm_col[0].rc_offset;
282                 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
283                 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
284                 rm->rm_col[1].rc_devidx = devidx;
285                 rm->rm_col[1].rc_offset = o;
286         }
287
288         zio->io_vsd = rm;
289         zio->io_vsd_free = vdev_raidz_map_free;
290         return (rm);
291 }
292
293 static void
294 vdev_raidz_generate_parity_p(raidz_map_t *rm)
295 {
296         uint64_t *p, *src, pcount, ccount, i;
297         int c;
298
299         pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
300
301         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
302                 src = rm->rm_col[c].rc_data;
303                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
304                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
305
306                 if (c == rm->rm_firstdatacol) {
307                         ASSERT(ccount == pcount);
308                         for (i = 0; i < ccount; i++, p++, src++) {
309                                 *p = *src;
310                         }
311                 } else {
312                         ASSERT(ccount <= pcount);
313                         for (i = 0; i < ccount; i++, p++, src++) {
314                                 *p ^= *src;
315                         }
316                 }
317         }
318 }
319
320 static void
321 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
322 {
323         uint64_t *q, *p, *src, pcount, ccount, mask, i;
324         int c;
325
326         pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
327         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
328             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
329
330         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
331                 src = rm->rm_col[c].rc_data;
332                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
333                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
334                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
335
336                 if (c == rm->rm_firstdatacol) {
337                         ASSERT(ccount == pcount || ccount == 0);
338                         for (i = 0; i < ccount; i++, p++, q++, src++) {
339                                 *q = *src;
340                                 *p = *src;
341                         }
342                         for (; i < pcount; i++, p++, q++, src++) {
343                                 *q = 0;
344                                 *p = 0;
345                         }
346                 } else {
347                         ASSERT(ccount <= pcount);
348
349                         /*
350                          * Rather than multiplying each byte individually (as
351                          * described above), we are able to handle 8 at once
352                          * by generating a mask based on the high bit in each
353                          * byte and using that to conditionally XOR in 0x1d.
354                          */
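                        /*
                         * Concretely: (mask << 1) - (mask >> 7) turns every
                         * byte of mask whose high bit was set into 0xff and
                         * every other byte into 0x00. The 0xfe... mask then
                         * drops the bits the shift pushed across byte
                         * boundaries, and XORing in (mask & 0x1d...) applies
                         * the reduction only to the bytes that overflowed --
                         * VDEV_RAIDZ_MUL_2() applied to eight bytes at once.
                         */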
355                         for (i = 0; i < ccount; i++, p++, q++, src++) {
356                                 mask = *q & 0x8080808080808080ULL;
357                                 mask = (mask << 1) - (mask >> 7);
358                                 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
359                                     (mask & 0x1d1d1d1d1d1d1d1dULL);
360                                 *q ^= *src;
361                                 *p ^= *src;
362                         }
363
364                         /*
365                          * Treat short columns as though they are full of 0s.
366                          */
367                         for (; i < pcount; i++, q++) {
368                                 mask = *q & 0x8080808080808080ULL;
369                                 mask = (mask << 1) - (mask >> 7);
370                                 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
371                                     (mask & 0x1d1d1d1d1d1d1d1dULL);
372                         }
373                 }
374         }
375 }
376
377 static void
378 vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
379 {
380         uint64_t *dst, *src, xcount, ccount, count, i;
381         int c;
382
383         xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
384         ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
385         ASSERT(xcount > 0);
386
387         src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
388         dst = rm->rm_col[x].rc_data;
389         for (i = 0; i < xcount; i++, dst++, src++) {
390                 *dst = *src;
391         }
392
393         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
394                 src = rm->rm_col[c].rc_data;
395                 dst = rm->rm_col[x].rc_data;
396
397                 if (c == x)
398                         continue;
399
400                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
401                 count = MIN(ccount, xcount);
402
403                 for (i = 0; i < count; i++, dst++, src++) {
404                         *dst ^= *src;
405                 }
406         }
407 }
408
409 static void
410 vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
411 {
412         uint64_t *dst, *src, xcount, ccount, count, mask, i;
413         uint8_t *b;
414         int c, j, exp;
415
416         xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
417         ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
418
419         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
420                 src = rm->rm_col[c].rc_data;
421                 dst = rm->rm_col[x].rc_data;
422
423                 if (c == x)
424                         ccount = 0;
425                 else
426                         ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
427
428                 count = MIN(ccount, xcount);
429
430                 if (c == rm->rm_firstdatacol) {
431                         for (i = 0; i < count; i++, dst++, src++) {
432                                 *dst = *src;
433                         }
434                         for (; i < xcount; i++, dst++) {
435                                 *dst = 0;
436                         }
437
438                 } else {
439                         /*
440                          * For an explanation of this, see the comment in
441                          * vdev_raidz_generate_parity_pq() above.
442                          */
443                         for (i = 0; i < count; i++, dst++, src++) {
444                                 mask = *dst & 0x8080808080808080ULL;
445                                 mask = (mask << 1) - (mask >> 7);
446                                 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
447                                     (mask & 0x1d1d1d1d1d1d1d1dULL);
448                                 *dst ^= *src;
449                         }
450
451                         for (; i < xcount; i++, dst++) {
452                                 mask = *dst & 0x8080808080808080ULL;
453                                 mask = (mask << 1) - (mask >> 7);
454                                 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
455                                     (mask & 0x1d1d1d1d1d1d1d1dULL);
456                         }
457                 }
458         }
459
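        /*
         * The target column now holds Q as it would be with column x zeroed
         * out. XORing in the real Q isolates 2^(ndevs - 1 - x) * D_x, and
         * multiplying each byte by 2^(255 - (ndevs - 1 - x)) -- the inverse
         * of that coefficient -- recovers D_x.
         */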
460         src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
461         dst = rm->rm_col[x].rc_data;
462         exp = 255 - (rm->rm_cols - 1 - x);
463
464         for (i = 0; i < xcount; i++, dst++, src++) {
465                 *dst ^= *src;
466                 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
467                         *b = vdev_raidz_exp2(*b, exp);
468                 }
469         }
470 }
471
472 static void
473 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
474 {
475         uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
476         void *pdata, *qdata;
477         uint64_t xsize, ysize, i;
478
479         ASSERT(x < y);
480         ASSERT(x >= rm->rm_firstdatacol);
481         ASSERT(y < rm->rm_cols);
482
483         ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
484
485         /*
486          * Move the parity data aside -- we're going to compute parity as
487          * though columns x and y were full of zeros -- Pxy and Qxy. We want to
488          * reuse the parity generation mechanism without trashing the actual
489          * parity so we make those columns appear to be full of zeros by
490          * setting their lengths to zero.
491          */
492         pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
493         qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
494         xsize = rm->rm_col[x].rc_size;
495         ysize = rm->rm_col[y].rc_size;
496
497         rm->rm_col[VDEV_RAIDZ_P].rc_data =
498             zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
499         rm->rm_col[VDEV_RAIDZ_Q].rc_data =
500             zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
501         rm->rm_col[x].rc_size = 0;
502         rm->rm_col[y].rc_size = 0;
503
504         vdev_raidz_generate_parity_pq(rm);
505
506         rm->rm_col[x].rc_size = xsize;
507         rm->rm_col[y].rc_size = ysize;
508
509         p = pdata;
510         q = qdata;
511         pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
512         qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
513         xd = rm->rm_col[x].rc_data;
514         yd = rm->rm_col[y].rc_data;
515
516         /*
517          * We now have:
518          *      Pxy = P + D_x + D_y
519          *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
520          *
521          * We can then solve for D_x:
522          *      D_x = A * (P + Pxy) + B * (Q + Qxy)
523          * where
524          *      A = 2^(x - y) * (2^(x - y) + 1)^-1
525          *      B = (2^(ndevs - 1 - x))^-1 * (2^(x - y) + 1)^-1
526          *
527          * With D_x in hand, we can easily solve for D_y:
528          *      D_y = P + Pxy + D_x
529          */
530
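        /*
         * a is 2^(x - y) (the pow2 index wraps modulo 255), b is the
         * multiplicative inverse of column x's Q coefficient,
         * 2^(ndevs - 1 - x), and 2^tmp is the inverse of (2^(x - y) + 1).
         * aexp and bexp are thus the logs of the coefficients A and B above,
         * which the loop below applies bytewise via vdev_raidz_exp2().
         */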
531         a = vdev_raidz_pow2[255 + x - y];
532         b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
533         tmp = 255 - vdev_raidz_log2[a ^ 1];
534
535         aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
536         bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
537
538         for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
539                 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
540                     vdev_raidz_exp2(*q ^ *qxy, bexp);
541
542                 if (i < ysize)
543                         *yd = *p ^ *pxy ^ *xd;
544         }
545
546         zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
547             rm->rm_col[VDEV_RAIDZ_P].rc_size);
548         zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
549             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
550
551         /*
552          * Restore the saved parity data.
553          */
554         rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
555         rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
556 }
557
558
559 static int
560 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
561 {
562         vdev_t *cvd;
563         uint64_t nparity = vd->vdev_nparity;
564         int c, error;
565         int lasterror = 0;
566         int numerrors = 0;
567
568         ASSERT(nparity > 0);
569
570         if (nparity > VDEV_RAIDZ_MAXPARITY ||
571             vd->vdev_children < nparity + 1) {
572                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
573                 return (EINVAL);
574         }
575
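        /*
         * Open every child, remembering the last error and the number of
         * children that failed. The per-child size is clamped to the
         * smallest child asize seen and the allocation shift raised to the
         * largest child ashift; the total below is then the number of
         * children times that per-child minimum.
         */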
576         for (c = 0; c < vd->vdev_children; c++) {
577                 cvd = vd->vdev_child[c];
578
579                 if ((error = vdev_open(cvd)) != 0) {
580                         lasterror = error;
581                         numerrors++;
582                         continue;
583                 }
584
585                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
586                 *ashift = MAX(*ashift, cvd->vdev_ashift);
587         }
588
589         *asize *= vd->vdev_children;
590
591         if (numerrors > nparity) {
592                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
593                 return (lasterror);
594         }
595
596         return (0);
597 }
598
599 static void
600 vdev_raidz_close(vdev_t *vd)
601 {
602         int c;
603
604         for (c = 0; c < vd->vdev_children; c++)
605                 vdev_close(vd->vdev_child[c]);
606 }
607
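/*
 * Convert a logical (data) size into the amount of space it occupies on the
 * raidz vdev: round psize up to whole sectors, add one parity sector per
 * parity device for every (cols - nparity) data sectors, and round the total
 * up to a multiple of (nparity + 1) sectors.
 */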
608 static uint64_t
609 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
610 {
611         uint64_t asize;
612         uint64_t ashift = vd->vdev_top->vdev_ashift;
613         uint64_t cols = vd->vdev_children;
614         uint64_t nparity = vd->vdev_nparity;
615
616         asize = ((psize - 1) >> ashift) + 1;
617         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
618         asize = roundup(asize, nparity + 1) << ashift;
619
620         return (asize);
621 }
622
623 static void
624 vdev_raidz_child_done(zio_t *zio)
625 {
626         raidz_col_t *rc = zio->io_private;
627
628         rc->rc_error = zio->io_error;
629         rc->rc_tried = 1;
630         rc->rc_skipped = 0;
631 }
632
633 static int
634 vdev_raidz_io_start(zio_t *zio)
635 {
636         vdev_t *vd = zio->io_vd;
637         vdev_t *tvd = vd->vdev_top;
638         vdev_t *cvd;
639         blkptr_t *bp = zio->io_bp;
640         raidz_map_t *rm;
641         raidz_col_t *rc;
642         int c;
643
644         rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
645             vd->vdev_nparity);
646
647         ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
648
649         if (zio->io_type == ZIO_TYPE_WRITE) {
650                 /*
651                  * Generate RAID parity in the first virtual columns.
652                  */
653                 if (rm->rm_firstdatacol == 1)
654                         vdev_raidz_generate_parity_p(rm);
655                 else
656                         vdev_raidz_generate_parity_pq(rm);
657
658                 for (c = 0; c < rm->rm_cols; c++) {
659                         rc = &rm->rm_col[c];
660                         cvd = vd->vdev_child[rc->rc_devidx];
661                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
662                             rc->rc_offset, rc->rc_data, rc->rc_size,
663                             zio->io_type, zio->io_priority, 0,
664                             vdev_raidz_child_done, rc));
665                 }
666
667                 return (ZIO_PIPELINE_CONTINUE);
668         }
669
670         ASSERT(zio->io_type == ZIO_TYPE_READ);
671
672         /*
673          * Iterate over the columns in reverse order so that we hit the parity
674          * last -- any errors along the way will force us to read the parity
675          * data.
676          */
677         for (c = rm->rm_cols - 1; c >= 0; c--) {
678                 rc = &rm->rm_col[c];
679                 cvd = vd->vdev_child[rc->rc_devidx];
680                 if (!vdev_readable(cvd)) {
681                         if (c >= rm->rm_firstdatacol)
682                                 rm->rm_missingdata++;
683                         else
684                                 rm->rm_missingparity++;
685                         rc->rc_error = ENXIO;
686                         rc->rc_tried = 1;       /* don't even try */
687                         rc->rc_skipped = 1;
688                         continue;
689                 }
690                 if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
691                         if (c >= rm->rm_firstdatacol)
692                                 rm->rm_missingdata++;
693                         else
694                                 rm->rm_missingparity++;
695                         rc->rc_error = ESTALE;
696                         rc->rc_skipped = 1;
697                         continue;
698                 }
699                 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
700                     (zio->io_flags & ZIO_FLAG_SCRUB)) {
701                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
702                             rc->rc_offset, rc->rc_data, rc->rc_size,
703                             zio->io_type, zio->io_priority, 0,
704                             vdev_raidz_child_done, rc));
705                 }
706         }
707
708         return (ZIO_PIPELINE_CONTINUE);
709 }
710
711 /*
712  * Report a checksum error for a child of a RAID-Z device.
713  */
714 static void
715 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
716 {
717         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
718
719         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
720                 mutex_enter(&vd->vdev_stat_lock);
721                 vd->vdev_stat.vs_checksum_errors++;
722                 mutex_exit(&vd->vdev_stat_lock);
723         }
724
725         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
726                 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
727                     zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
728 }
729
730 /*
731  * Generate the parity from the data columns. If we tried and were able to
732  * read the parity without error, verify that the generated parity matches the
733  * data we read. If it doesn't, we fire off a checksum error. Return the
734  * number of such failures.
735  */
736 static int
737 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
738 {
739         void *orig[VDEV_RAIDZ_MAXPARITY];
740         int c, ret = 0;
741         raidz_col_t *rc;
742
743         for (c = 0; c < rm->rm_firstdatacol; c++) {
744                 rc = &rm->rm_col[c];
745                 if (!rc->rc_tried || rc->rc_error != 0)
746                         continue;
747                 orig[c] = zio_buf_alloc(rc->rc_size);
748                 bcopy(rc->rc_data, orig[c], rc->rc_size);
749         }
750
751         if (rm->rm_firstdatacol == 1)
752                 vdev_raidz_generate_parity_p(rm);
753         else
754                 vdev_raidz_generate_parity_pq(rm);
755
756         for (c = 0; c < rm->rm_firstdatacol; c++) {
757                 rc = &rm->rm_col[c];
758                 if (!rc->rc_tried || rc->rc_error != 0)
759                         continue;
760                 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
761                         raidz_checksum_error(zio, rc);
762                         rc->rc_error = ECKSUM;
763                         ret++;
764                 }
765                 zio_buf_free(orig[c], rc->rc_size);
766         }
767
768         return (ret);
769 }
770
771 static uint64_t raidz_corrected_p;
772 static uint64_t raidz_corrected_q;
773 static uint64_t raidz_corrected_pq;
774
775 static int
776 vdev_raidz_worst_error(raidz_map_t *rm)
777 {
778         int error = 0;
779
780         for (int c = 0; c < rm->rm_cols; c++)
781                 error = zio_worst_error(error, rm->rm_col[c].rc_error);
782
783         return (error);
784 }
785
786 static void
787 vdev_raidz_io_done(zio_t *zio)
788 {
789         vdev_t *vd = zio->io_vd;
790         vdev_t *cvd;
791         raidz_map_t *rm = zio->io_vsd;
792         raidz_col_t *rc, *rc1;
793         int unexpected_errors = 0;
794         int parity_errors = 0;
795         int parity_untried = 0;
796         int data_errors = 0;
797         int total_errors = 0;
798         int n, c, c1;
799
800         ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
801
802         ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
803         ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
804
805         for (c = 0; c < rm->rm_cols; c++) {
806                 rc = &rm->rm_col[c];
807
808                 if (rc->rc_error) {
809                         ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
810
811                         if (c < rm->rm_firstdatacol)
812                                 parity_errors++;
813                         else
814                                 data_errors++;
815
816                         if (!rc->rc_skipped)
817                                 unexpected_errors++;
818
819                         total_errors++;
820                 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
821                         parity_untried++;
822                 }
823         }
824
825         if (zio->io_type == ZIO_TYPE_WRITE) {
826                 /*
827                  * XXX -- for now, treat partial writes as a success.
828                  * (If we couldn't write enough columns to reconstruct
829                  * the data, the I/O failed.  Otherwise, good enough.)
830                  *
831                  * Now that we support write reallocation, it would be better
832                  * to treat partial failure as real failure unless there are
833                  * no non-degraded top-level vdevs left, and not update DTLs
834                  * if we intend to reallocate.
835                  */
836                 /* XXPOLICY */
837                 if (total_errors > rm->rm_firstdatacol)
838                         zio->io_error = vdev_raidz_worst_error(rm);
839
840                 return;
841         }
842
843         ASSERT(zio->io_type == ZIO_TYPE_READ);
844         /*
845          * There are three potential phases for a read:
846          *      1. produce valid data from the columns read
847          *      2. read all disks and try again
848          *      3. perform combinatorial reconstruction
849          *
850          * Each phase is progressively both more expensive and less likely to
851          * occur. If we encounter more errors than we can repair or all phases
852          * fail, we have no choice but to return an error.
853          */
854
855         /*
856          * If the number of errors we saw was correctable -- less than or equal
857          * to the number of parity disks read -- attempt to produce data that
858          * has a valid checksum. Naturally, this case applies in the absence of
859          * any errors.
860          */
861         if (total_errors <= rm->rm_firstdatacol - parity_untried) {
862                 switch (data_errors) {
863                 case 0:
864                         if (zio_checksum_error(zio) == 0) {
865                                 /*
866                                  * If we read parity information (unnecessarily
867                                  * as it happens since no reconstruction was
868                                  * needed) regenerate and verify the parity.
869                                  * We also regenerate parity when resilvering
870                                  * so we can write it out to the failed device
871                                  * later.
872                                  */
873                                 if (parity_errors + parity_untried <
874                                     rm->rm_firstdatacol ||
875                                     (zio->io_flags & ZIO_FLAG_RESILVER)) {
876                                         n = raidz_parity_verify(zio, rm);
877                                         unexpected_errors += n;
878                                         ASSERT(parity_errors + n <=
879                                             rm->rm_firstdatacol);
880                                 }
881                                 goto done;
882                         }
883                         break;
884
885                 case 1:
886                         /*
887                          * We either attempt to read all the parity columns or
888                          * none of them. If we didn't try to read parity, we
889                          * wouldn't be here in the correctable case. There must
890                          * also have been fewer parity errors than parity
891                          * columns or, again, we wouldn't be in this code path.
892                          */
893                         ASSERT(parity_untried == 0);
894                         ASSERT(parity_errors < rm->rm_firstdatacol);
895
896                         /*
897                          * Find the column that reported the error.
898                          */
899                         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
900                                 rc = &rm->rm_col[c];
901                                 if (rc->rc_error != 0)
902                                         break;
903                         }
904                         ASSERT(c != rm->rm_cols);
905                         ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
906                             rc->rc_error == ESTALE);
907
908                         if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
909                                 vdev_raidz_reconstruct_p(rm, c);
910                         } else {
911                                 ASSERT(rm->rm_firstdatacol > 1);
912                                 vdev_raidz_reconstruct_q(rm, c);
913                         }
914
915                         if (zio_checksum_error(zio) == 0) {
916                                 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
917                                         atomic_inc_64(&raidz_corrected_p);
918                                 else
919                                         atomic_inc_64(&raidz_corrected_q);
920
921                                 /*
922                                  * If there's more than one parity disk that
923                                  * was successfully read, confirm that the
924                                  * other parity disk produced the correct data.
925                                  * This routine is suboptimal in that it
926                                  * regenerates both the parity we wish to test
927                                  * as well as the parity we just used to
928                                  * perform the reconstruction, but this should
929                                  * be a relatively uncommon case, and can be
930                                  * optimized if it becomes a problem.
931                                  * We also regenerate parity when resilvering
932                                  * so we can write it out to the failed device
933                                  * later.
934                                  */
935                                 if (parity_errors < rm->rm_firstdatacol - 1 ||
936                                     (zio->io_flags & ZIO_FLAG_RESILVER)) {
937                                         n = raidz_parity_verify(zio, rm);
938                                         unexpected_errors += n;
939                                         ASSERT(parity_errors + n <=
940                                             rm->rm_firstdatacol);
941                                 }
942
943                                 goto done;
944                         }
945                         break;
946
947                 case 2:
948                         /*
949                          * Two data column errors require double parity.
950                          */
951                         ASSERT(rm->rm_firstdatacol == 2);
952
953                         /*
954                          * Find the two columns that reported errors.
955                          */
956                         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
957                                 rc = &rm->rm_col[c];
958                                 if (rc->rc_error != 0)
959                                         break;
960                         }
961                         ASSERT(c != rm->rm_cols);
962                         ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
963                             rc->rc_error == ESTALE);
964
965                         for (c1 = c++; c < rm->rm_cols; c++) {
966                                 rc = &rm->rm_col[c];
967                                 if (rc->rc_error != 0)
968                                         break;
969                         }
970                         ASSERT(c != rm->rm_cols);
971                         ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
972                             rc->rc_error == ESTALE);
973
974                         vdev_raidz_reconstruct_pq(rm, c1, c);
975
976                         if (zio_checksum_error(zio) == 0) {
977                                 atomic_inc_64(&raidz_corrected_pq);
978                                 goto done;
979                         }
980                         break;
981
982                 default:
983                         ASSERT(rm->rm_firstdatacol <= 2);
984                         ASSERT(0);
985                 }
986         }
987
988         /*
989          * This isn't a typical situation -- either we got a read error or
990          * a child silently returned bad data. Read every block so we can
991          * try again with as much data and parity as we can track down. If
992          * we've already been through once before, all children will be marked
993          * as tried so we'll proceed to combinatorial reconstruction.
994          */
995         unexpected_errors = 1;
996         rm->rm_missingdata = 0;
997         rm->rm_missingparity = 0;
998
999         for (c = 0; c < rm->rm_cols; c++) {
1000                 if (rm->rm_col[c].rc_tried)
1001                         continue;
1002
1003                 zio_vdev_io_redone(zio);
1004                 do {
1005                         rc = &rm->rm_col[c];
1006                         if (rc->rc_tried)
1007                                 continue;
1008                         zio_nowait(zio_vdev_child_io(zio, NULL,
1009                             vd->vdev_child[rc->rc_devidx],
1010                             rc->rc_offset, rc->rc_data, rc->rc_size,
1011                             zio->io_type, zio->io_priority, 0,
1012                             vdev_raidz_child_done, rc));
1013                 } while (++c < rm->rm_cols);
1014
1015                 return;
1016         }
1017
1018         /*
1019          * At this point we've attempted to reconstruct the data given the
1020          * errors we detected, and we've attempted to read all columns. There
1021          * must, therefore, be one or more additional problems -- silent errors
1022          * resulting in invalid data rather than explicit I/O errors resulting
1023          * in absent data. Before we attempt combinatorial reconstruction make
1024          * sure we have a chance of coming up with the right answer.
1025          */
1026         if (total_errors >= rm->rm_firstdatacol) {
1027                 zio->io_error = vdev_raidz_worst_error(rm);
1028                 /*
1029                  * If there were exactly as many device errors as parity
1030                  * columns, yet we couldn't reconstruct the data, then at
1031                  * least one device must have returned bad data silently.
1032                  */
1033                 if (total_errors == rm->rm_firstdatacol)
1034                         zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
1035                 goto done;
1036         }
1037
1038         if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
1039                 /*
1040                  * Attempt to reconstruct the data from parity P.
1041                  */
1042                 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1043                         void *orig;
1044                         rc = &rm->rm_col[c];
1045
1046                         orig = zio_buf_alloc(rc->rc_size);
1047                         bcopy(rc->rc_data, orig, rc->rc_size);
1048                         vdev_raidz_reconstruct_p(rm, c);
1049
1050                         if (zio_checksum_error(zio) == 0) {
1051                                 zio_buf_free(orig, rc->rc_size);
1052                                 atomic_inc_64(&raidz_corrected_p);
1053
1054                                 /*
1055                                  * If this child didn't know that it returned
1056                                  * bad data, inform it.
1057                                  */
1058                                 if (rc->rc_tried && rc->rc_error == 0)
1059                                         raidz_checksum_error(zio, rc);
1060                                 rc->rc_error = ECKSUM;
1061                                 goto done;
1062                         }
1063
1064                         bcopy(orig, rc->rc_data, rc->rc_size);
1065                         zio_buf_free(orig, rc->rc_size);
1066                 }
1067         }
1068
1069         if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1070                 /*
1071                  * Attempt to reconstruct the data from parity Q.
1072                  */
1073                 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1074                         void *orig;
1075                         rc = &rm->rm_col[c];
1076
1077                         orig = zio_buf_alloc(rc->rc_size);
1078                         bcopy(rc->rc_data, orig, rc->rc_size);
1079                         vdev_raidz_reconstruct_q(rm, c);
1080
1081                         if (zio_checksum_error(zio) == 0) {
1082                                 zio_buf_free(orig, rc->rc_size);
1083                                 atomic_inc_64(&raidz_corrected_q);
1084
1085                                 /*
1086                                  * If this child didn't know that it returned
1087                                  * bad data, inform it.
1088                                  */
1089                                 if (rc->rc_tried && rc->rc_error == 0)
1090                                         raidz_checksum_error(zio, rc);
1091                                 rc->rc_error = ECKSUM;
1092                                 goto done;
1093                         }
1094
1095                         bcopy(orig, rc->rc_data, rc->rc_size);
1096                         zio_buf_free(orig, rc->rc_size);
1097                 }
1098         }
1099
1100         if (rm->rm_firstdatacol > 1 &&
1101             rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
1102             rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1103                 /*
1104                  * Attempt to reconstruct the data from both P and Q.
1105                  */
1106                 for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
1107                         void *orig, *orig1;
1108                         rc = &rm->rm_col[c];
1109
1110                         orig = zio_buf_alloc(rc->rc_size);
1111                         bcopy(rc->rc_data, orig, rc->rc_size);
1112
1113                         for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
1114                                 rc1 = &rm->rm_col[c1];
1115
1116                                 orig1 = zio_buf_alloc(rc1->rc_size);
1117                                 bcopy(rc1->rc_data, orig1, rc1->rc_size);
1118
1119                                 vdev_raidz_reconstruct_pq(rm, c, c1);
1120
1121                                 if (zio_checksum_error(zio) == 0) {
1122                                         zio_buf_free(orig, rc->rc_size);
1123                                         zio_buf_free(orig1, rc1->rc_size);
1124                                         atomic_inc_64(&raidz_corrected_pq);
1125
1126                                         /*
1127                                          * If these children didn't know they
1128                                          * returned bad data, inform them.
1129                                          */
1130                                         if (rc->rc_tried && rc->rc_error == 0)
1131                                                 raidz_checksum_error(zio, rc);
1132                                         if (rc1->rc_tried && rc1->rc_error == 0)
1133                                                 raidz_checksum_error(zio, rc1);
1134
1135                                         rc->rc_error = ECKSUM;
1136                                         rc1->rc_error = ECKSUM;
1137
1138                                         goto done;
1139                                 }
1140
1141                                 bcopy(orig1, rc1->rc_data, rc1->rc_size);
1142                                 zio_buf_free(orig1, rc1->rc_size);
1143                         }
1144
1145                         bcopy(orig, rc->rc_data, rc->rc_size);
1146                         zio_buf_free(orig, rc->rc_size);
1147                 }
1148         }
1149
1150         /*
1151          * All combinations failed to checksum. Generate checksum ereports for
1152          * all children.
1153          */
1154         zio->io_error = ECKSUM;
1155
1156         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1157                 for (c = 0; c < rm->rm_cols; c++) {
1158                         rc = &rm->rm_col[c];
1159                         zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1160                             zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
1161                             rc->rc_offset, rc->rc_size);
1162                 }
1163         }
1164
1165 done:
1166         zio_checksum_verified(zio);
1167
1168         if (zio->io_error == 0 && (spa_mode & FWRITE) &&
1169             (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1170                 /*
1171                  * Use the good data we have in hand to repair damaged children.
1172                  */
1173                 for (c = 0; c < rm->rm_cols; c++) {
1174                         rc = &rm->rm_col[c];
1175                         cvd = vd->vdev_child[rc->rc_devidx];
1176
1177                         if (rc->rc_error == 0)
1178                                 continue;
1179
1180                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1181                             rc->rc_offset, rc->rc_data, rc->rc_size,
1182                             ZIO_TYPE_WRITE, zio->io_priority,
1183                             ZIO_FLAG_IO_REPAIR, NULL, NULL));
1184                 }
1185         }
1186 }
1187
1188 static void
1189 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1190 {
1191         if (faulted > vd->vdev_nparity)
1192                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1193                     VDEV_AUX_NO_REPLICAS);
1194         else if (degraded + faulted != 0)
1195                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1196         else
1197                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1198 }
1199
1200 vdev_ops_t vdev_raidz_ops = {
1201         vdev_raidz_open,
1202         vdev_raidz_close,
1203         vdev_raidz_asize,
1204         vdev_raidz_io_start,
1205         vdev_raidz_io_done,
1206         vdev_raidz_state_change,
1207         VDEV_TYPE_RAIDZ,        /* name of this vdev type */
1208         B_FALSE                 /* not a leaf vdev */
1209 };