/* FreeBSD releng/7.2: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "%Z%%M% %I%     %E% SMI"

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports both single and double parity. For single parity, we
 * use a simple XOR of all the data columns. For double parity, we use both
 * the simple XOR and a technique described in "The mathematics of
 * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
 * over the integers expressible in a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *      (A * 2)_7 = A_6
 *      (A * 2)_6 = A_5
 *      (A * 2)_5 = A_4
 *      (A * 2)_4 = A_3 + A_7
 *      (A * 2)_3 = A_2 + A_7
 *      (A * 2)_2 = A_1 + A_7
 *      (A * 2)_1 = A_0
 *      (A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is A^254.
 *
 * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
 * can be expressed by field operations:
 *
 *      P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *      Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *        = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *
 * See the reconstruction code below for how P and Q can be used individually or
 * in concert to recover missing data columns.
 */

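/*
 * Editor's note: the block below is an illustrative, compiled-out sketch of
 * the arithmetic described above and is not part of the original driver.
 * The stripe contents and helper names are hypothetical. It computes P and Q
 * for a toy stripe of four one-byte data columns using the multiply-by-2
 * expression given in the comment, and checks the high-bit wrap-around case.
 */
#if 0
#include <stdint.h>
#include <assert.h>

static uint8_t
gf_mul2_sketch(uint8_t a)
{
        /* (a << 1) with the high bit folded back in as 0x1d (see above). */
        return ((uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0)));
}

static void
raidz_parity_sketch(void)
{
        uint8_t d[4] = { 0x01, 0x02, 0x03, 0x04 };      /* toy data columns */
        uint8_t p = 0, q = 0;
        int i;

        for (i = 0; i < 4; i++) {
                p ^= d[i];                      /* P: plain XOR */
                q = gf_mul2_sketch(q) ^ d[i];   /* Q: Horner form of the sum */
        }

        assert(p == 0x04 && q == 0x02);
        assert(gf_mul2_sketch(0x80) == 0x1d);   /* high bit wraps to 0x1d */
}
#endif
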
typedef struct raidz_col {
        uint64_t rc_devidx;             /* child device index for I/O */
        uint64_t rc_offset;             /* device offset */
        uint64_t rc_size;               /* I/O size */
        void *rc_data;                  /* I/O data */
        int rc_error;                   /* I/O error for this device */
        uint8_t rc_tried;               /* Did we attempt this I/O column? */
        uint8_t rc_skipped;             /* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
        uint64_t rm_cols;               /* Column count */
        uint64_t rm_bigcols;            /* Number of oversized columns */
        uint64_t rm_asize;              /* Actual total I/O size */
        uint64_t rm_missingdata;        /* Count of missing data devices */
        uint64_t rm_missingparity;      /* Count of missing parity devices */
        uint64_t rm_firstdatacol;       /* First data column/parity count */
        raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
} raidz_map_t;

#define VDEV_RAIDZ_P            0
#define VDEV_RAIDZ_Q            1

#define VDEV_RAIDZ_MAXPARITY    2

#define VDEV_RAIDZ_MUL_2(a)     (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
        0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
        0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
        0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
        0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
        0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
        0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
        0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
        0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
        0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
        0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
        0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
        0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
        0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
        0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
        0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
        0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
        0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
        0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
        0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
        0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
        0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
        0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
        0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
        0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
        0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
        0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
        0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
        0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
        0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
        0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
        0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
        0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
        0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
        0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
        0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
        0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
        0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
        0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
        0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
        0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
        0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
        0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
        0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
        0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
        0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
        0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
        0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
        0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
        0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
        0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
        0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
        0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
        0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
        0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
        0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
        0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
        0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
        0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
        0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
        0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
        0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
        0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

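/*
 * Editor's note: the block below is an illustrative, compiled-out sketch and
 * is not part of the original file. It shows how tables like the two above
 * can be produced: the powers of 2 come from repeated use of the
 * multiply-by-2 expression, and the log table is simply its inverse mapping.
 */
#if 0
#include <stdint.h>

static void
raidz_build_tables_sketch(uint8_t p2[256], uint8_t l2[256])
{
        uint8_t x = 1;
        int i;

        l2[0] = 0;              /* log(0) is undefined; the table stores 0 */
        for (i = 0; i < 255; i++) {
                p2[i] = x;
                l2[x] = (uint8_t)i;
                x = (uint8_t)((x << 1) ^ ((x & 0x80) ? 0x1d : 0));
        }
        p2[255] = 0x01;         /* 2^255 == 2^0 in the 255-element group */
}
#endif
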
/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
        if (a == 0)
                return (0);

        ASSERT(exp >= 0);
        ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

        exp += vdev_raidz_log2[a];
        if (exp > 255)
                exp -= 255;

        return (vdev_raidz_pow2[exp]);
}

static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
        raidz_map_t *rm;
        uint64_t b = zio->io_offset >> unit_shift;
        uint64_t s = zio->io_size >> unit_shift;
        uint64_t f = b % dcols;
        uint64_t o = (b / dcols) << unit_shift;
        uint64_t q, r, c, bc, col, acols, coff, devidx;

        q = s / (dcols - nparity);
        r = s - q * (dcols - nparity);
        bc = (r == 0 ? 0 : r + nparity);

        acols = (q == 0 ? bc : dcols);

        rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);

        rm->rm_cols = acols;
        rm->rm_bigcols = bc;
        rm->rm_asize = 0;
        rm->rm_missingdata = 0;
        rm->rm_missingparity = 0;
        rm->rm_firstdatacol = nparity;

        for (c = 0; c < acols; c++) {
                col = f + c;
                coff = o;
                if (col >= dcols) {
                        col -= dcols;
                        coff += 1ULL << unit_shift;
                }
                rm->rm_col[c].rc_devidx = col;
                rm->rm_col[c].rc_offset = coff;
                rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
                rm->rm_col[c].rc_data = NULL;
                rm->rm_col[c].rc_error = 0;
                rm->rm_col[c].rc_tried = 0;
                rm->rm_col[c].rc_skipped = 0;
                rm->rm_asize += rm->rm_col[c].rc_size;
        }

        rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);

        for (c = 0; c < rm->rm_firstdatacol; c++)
                rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

        rm->rm_col[c].rc_data = zio->io_data;

        for (c = c + 1; c < acols; c++)
                rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
                    rm->rm_col[c - 1].rc_size;

        /*
         * If all data stored spans all columns, there's a danger that parity
         * will always be on the same device and, since parity isn't read
         * during normal operation, that that device's I/O bandwidth won't be
         * used effectively. We therefore switch the parity every 1MB.
         *
         * ... at least that was, ostensibly, the theory. As a practical
         * matter unless we juggle the parity between all devices evenly, we
         * won't see any benefit. Further, occasional writes that aren't a
         * multiple of the LCM of the number of children and the minimum
         * stripe width are sufficient to avoid pessimal behavior.
         * Unfortunately, this decision created an implicit on-disk format
         * requirement that we need to support for all eternity, but only
         * for single-parity RAID-Z.
         */
        ASSERT(rm->rm_cols >= 2);
        ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

        if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
                devidx = rm->rm_col[0].rc_devidx;
                o = rm->rm_col[0].rc_offset;
                rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
                rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
                rm->rm_col[1].rc_devidx = devidx;
                rm->rm_col[1].rc_offset = o;
        }

        zio->io_vsd = rm;
        return (rm);
}
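
/*
 * Editor's note (illustrative, not part of the original file): a worked
 * example of the geometry above, assuming a hypothetical 5-wide single-parity
 * raidz (dcols = 5, nparity = 1) with 512-byte sectors (unit_shift = 9) and a
 * 3072-byte I/O at offset 0:
 *
 *      s = 6 sectors, q = 6 / 4 = 1, r = 2, bc = r + nparity = 3, acols = 5
 *
 *      column      0 (P)   1       2       3       4
 *      rc_size     1024    1024    1024    512     512     bytes
 *
 * rm_asize comes to 4096 bytes, already a multiple of (nparity + 1) sectors,
 * and the four data columns together cover the 3072 bytes of zio->io_data.
 */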

static void
vdev_raidz_map_free(zio_t *zio)
{
        raidz_map_t *rm = zio->io_vsd;
        int c;

        for (c = 0; c < rm->rm_firstdatacol; c++)
                zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

        kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
        zio->io_vsd = NULL;
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
        uint64_t *p, *src, pcount, ccount, i;
        int c;

        pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                src = rm->rm_col[c].rc_data;
                p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
                ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

                if (c == rm->rm_firstdatacol) {
                        ASSERT(ccount == pcount);
                        for (i = 0; i < ccount; i++, p++, src++) {
                                *p = *src;
                        }
                } else {
                        ASSERT(ccount <= pcount);
                        for (i = 0; i < ccount; i++, p++, src++) {
                                *p ^= *src;
                        }
                }
        }
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
        uint64_t *q, *p, *src, pcount, ccount, mask, i;
        int c;

        pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
        ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
            rm->rm_col[VDEV_RAIDZ_Q].rc_size);

        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                src = rm->rm_col[c].rc_data;
                p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
                q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
                ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

                if (c == rm->rm_firstdatacol) {
                        ASSERT(ccount == pcount || ccount == 0);
                        for (i = 0; i < ccount; i++, p++, q++, src++) {
                                *q = *src;
                                *p = *src;
                        }
                        for (; i < pcount; i++, p++, q++, src++) {
                                *q = 0;
                                *p = 0;
                        }
                } else {
                        ASSERT(ccount <= pcount);

                        /*
                         * Rather than multiplying each byte individually (as
                         * described above), we are able to handle 8 at once
                         * by generating a mask based on the high bit in each
                         * byte and using that to conditionally XOR in 0x1d.
                         */
                        for (i = 0; i < ccount; i++, p++, q++, src++) {
                                mask = *q & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                                *q ^= *src;
                                *p ^= *src;
                        }

                        /*
                         * Treat short columns as though they are full of 0s.
                         */
                        for (; i < pcount; i++, q++) {
                                mask = *q & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                        }
                }
        }
}
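
/*
 * Editor's note: the block below is an illustrative, compiled-out sketch and
 * is not part of the original driver. It checks that the 64-bit mask trick
 * used in vdev_raidz_generate_parity_pq() above is equivalent to applying
 * VDEV_RAIDZ_MUL_2 to each of the eight bytes independently.
 */
#if 0
#include <stdint.h>
#include <string.h>
#include <assert.h>

static void
raidz_mul2_mask_sketch(uint64_t q)
{
        uint64_t mask = q & 0x8080808080808080ULL;
        uint64_t wide;
        uint8_t b[8];
        int i;

        /* Turn every set high bit into an 0xff byte selecting 0x1d below. */
        mask = (mask << 1) - (mask >> 7);
        wide = ((q << 1) & 0xfefefefefefefefeULL) ^
            (mask & 0x1d1d1d1d1d1d1d1dULL);

        /* Reference result: multiply each byte by 2 on its own. */
        memcpy(b, &q, sizeof (q));
        for (i = 0; i < 8; i++)
                b[i] = (uint8_t)((b[i] << 1) ^ ((b[i] & 0x80) ? 0x1d : 0));

        assert(memcmp(&wide, b, sizeof (wide)) == 0);
}
#endif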

static void
vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
{
        uint64_t *dst, *src, xcount, ccount, count, i;
        int c;

        xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
        ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
        ASSERT(xcount > 0);

        src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
        dst = rm->rm_col[x].rc_data;
        for (i = 0; i < xcount; i++, dst++, src++) {
                *dst = *src;
        }

        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                src = rm->rm_col[c].rc_data;
                dst = rm->rm_col[x].rc_data;

                if (c == x)
                        continue;

                ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
                count = MIN(ccount, xcount);

                for (i = 0; i < count; i++, dst++, src++) {
                        *dst ^= *src;
                }
        }
}

static void
vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
{
        uint64_t *dst, *src, xcount, ccount, count, mask, i;
        uint8_t *b;
        int c, j, exp;

        xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
        ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                src = rm->rm_col[c].rc_data;
                dst = rm->rm_col[x].rc_data;

                if (c == x)
                        ccount = 0;
                else
                        ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

                count = MIN(ccount, xcount);

                if (c == rm->rm_firstdatacol) {
                        for (i = 0; i < count; i++, dst++, src++) {
                                *dst = *src;
                        }
                        for (; i < xcount; i++, dst++) {
                                *dst = 0;
                        }

                } else {
                        /*
                         * For an explanation of this, see the comment in
                         * vdev_raidz_generate_parity_pq() above.
                         */
                        for (i = 0; i < count; i++, dst++, src++) {
                                mask = *dst & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                                *dst ^= *src;
                        }

                        for (; i < xcount; i++, dst++) {
                                mask = *dst & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                        }
                }
        }

        src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
        dst = rm->rm_col[x].rc_data;
        exp = 255 - (rm->rm_cols - 1 - x);

        for (i = 0; i < xcount; i++, dst++, src++) {
                *dst ^= *src;
                for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
                        *b = vdev_raidz_exp2(*b, exp);
                }
        }
}

static void
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
{
        uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
        void *pdata, *qdata;
        uint64_t xsize, ysize, i;

        ASSERT(x < y);
        ASSERT(x >= rm->rm_firstdatacol);
        ASSERT(y < rm->rm_cols);

        ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

        /*
         * Move the parity data aside -- we're going to compute parity as
         * though columns x and y were full of zeros -- Pxy and Qxy. We want to
         * reuse the parity generation mechanism without trashing the actual
         * parity so we make those columns appear to be full of zeros by
         * setting their lengths to zero.
         */
        pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
        qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
        xsize = rm->rm_col[x].rc_size;
        ysize = rm->rm_col[y].rc_size;

        rm->rm_col[VDEV_RAIDZ_P].rc_data =
            zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
        rm->rm_col[VDEV_RAIDZ_Q].rc_data =
            zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
        rm->rm_col[x].rc_size = 0;
        rm->rm_col[y].rc_size = 0;

        vdev_raidz_generate_parity_pq(rm);

        rm->rm_col[x].rc_size = xsize;
        rm->rm_col[y].rc_size = ysize;

        p = pdata;
        q = qdata;
        pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
        qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
        xd = rm->rm_col[x].rc_data;
        yd = rm->rm_col[y].rc_data;

        /*
         * We now have:
         *      Pxy = P + D_x + D_y
         *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
         *
         * Adding 2^(ndevs - 1 - y) * (P + Pxy) to (Q + Qxy) eliminates D_y,
         * so we can solve for D_x:
         *      D_x = A * (P + Pxy) + B * (Q + Qxy)
         * where
         *      A = 2^(x - y) * (2^(x - y) + 1)^-1
         *      B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
         *
         * With D_x in hand, we can easily solve for D_y:
         *      D_y = P + Pxy + D_x
         */

        a = vdev_raidz_pow2[255 + x - y];
        b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
        tmp = 255 - vdev_raidz_log2[a ^ 1];

        aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
        bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

        for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
                *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
                    vdev_raidz_exp2(*q ^ *qxy, bexp);

                if (i < ysize)
                        *yd = *p ^ *pxy ^ *xd;
        }

        zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
            rm->rm_col[VDEV_RAIDZ_P].rc_size);
        zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
            rm->rm_col[VDEV_RAIDZ_Q].rc_size);

        /*
         * Restore the saved parity data.
         */
        rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
        rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
}
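
/*
 * Editor's note: the block below is an illustrative, compiled-out sketch and
 * is not part of the original driver. It exercises the algebra above on a
 * hypothetical 4-column map (P, Q and two data columns, x = 2, y = 3) with
 * D_x = 0x05 and D_y = 0x07, which gives P + Pxy = 0x02 and Q + Qxy = 0x0d;
 * the same coefficient computation should recover both data bytes.
 */
#if 0
#include <assert.h>

static void
raidz_pq_algebra_sketch(void)
{
        int x = 2, y = 3, cols = 4;
        uint8_t p_pxy = 0x02, q_qxy = 0x0d;     /* P + Pxy and Q + Qxy */
        uint8_t a, b, tmp, aexp, bexp, dx, dy;

        a = vdev_raidz_pow2[255 + x - y];
        b = vdev_raidz_pow2[255 - (cols - 1 - x)];
        tmp = 255 - vdev_raidz_log2[a ^ 1];
        aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
        bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

        dx = vdev_raidz_exp2(p_pxy, aexp) ^ vdev_raidz_exp2(q_qxy, bexp);
        dy = p_pxy ^ dx;

        assert(dx == 0x05 && dy == 0x07);
}
#endif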


static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
        vdev_t *cvd;
        uint64_t nparity = vd->vdev_nparity;
        int c, error;
        int lasterror = 0;
        int numerrors = 0;

        ASSERT(nparity > 0);

        if (nparity > VDEV_RAIDZ_MAXPARITY ||
            vd->vdev_children < nparity + 1) {
                vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
                return (EINVAL);
        }

        for (c = 0; c < vd->vdev_children; c++) {
                cvd = vd->vdev_child[c];

                if ((error = vdev_open(cvd)) != 0) {
                        lasterror = error;
                        numerrors++;
                        continue;
                }

                *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
                *ashift = MAX(*ashift, cvd->vdev_ashift);
        }

        *asize *= vd->vdev_children;

        if (numerrors > nparity) {
                vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
                return (lasterror);
        }

        return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
        int c;

        for (c = 0; c < vd->vdev_children; c++)
                vdev_close(vd->vdev_child[c]);
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
        uint64_t asize;
        uint64_t ashift = vd->vdev_top->vdev_ashift;
        uint64_t cols = vd->vdev_children;
        uint64_t nparity = vd->vdev_nparity;

        asize = ((psize - 1) >> ashift) + 1;
        asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
        asize = roundup(asize, nparity + 1) << ashift;

        return (asize);
}
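
/*
 * Editor's note (illustrative, not part of the original file): for a
 * hypothetical 5-wide raidz1 with 512-byte sectors (ashift = 9), a 128K psize
 * is 256 data sectors; ceil(256 / 4) = 64 parity sectors are added, giving
 * 320 sectors (160K). 320 is already a multiple of nparity + 1 = 2, so the
 * roundup is a no-op in this case.
 */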

static void
vdev_raidz_child_done(zio_t *zio)
{
        raidz_col_t *rc = zio->io_private;

        rc->rc_error = zio->io_error;
        rc->rc_tried = 1;
        rc->rc_skipped = 0;
}

static void
vdev_raidz_repair_done(zio_t *zio)
{
        ASSERT(zio->io_private == zio->io_parent);
        vdev_raidz_map_free(zio->io_private);
}

static void
vdev_raidz_io_start(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        vdev_t *tvd = vd->vdev_top;
        vdev_t *cvd;
        blkptr_t *bp = zio->io_bp;
        raidz_map_t *rm;
        raidz_col_t *rc;
        int c;

        rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
            vd->vdev_nparity);

        ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

        if (zio->io_type == ZIO_TYPE_WRITE) {
                /*
                 * Generate RAID parity in the first virtual columns.
                 */
                if (rm->rm_firstdatacol == 1)
                        vdev_raidz_generate_parity_p(rm);
                else
                        vdev_raidz_generate_parity_pq(rm);

                for (c = 0; c < rm->rm_cols; c++) {
                        rc = &rm->rm_col[c];
                        cvd = vd->vdev_child[rc->rc_devidx];
                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_data, rc->rc_size,
                            zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
                            vdev_raidz_child_done, rc));
                }
                zio_wait_children_done(zio);
                return;
        }

        ASSERT(zio->io_type == ZIO_TYPE_READ);

        /*
         * Iterate over the columns in reverse order so that we hit the parity
         * last -- any errors along the way will force us to read the parity
         * data.
         */
        for (c = rm->rm_cols - 1; c >= 0; c--) {
                rc = &rm->rm_col[c];
                cvd = vd->vdev_child[rc->rc_devidx];
                if (vdev_is_dead(cvd)) {
                        if (c >= rm->rm_firstdatacol)
                                rm->rm_missingdata++;
                        else
                                rm->rm_missingparity++;
                        rc->rc_error = ENXIO;
                        rc->rc_tried = 1;       /* don't even try */
                        rc->rc_skipped = 1;
                        continue;
                }
                if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
                        if (c >= rm->rm_firstdatacol)
                                rm->rm_missingdata++;
                        else
                                rm->rm_missingparity++;
                        rc->rc_error = ESTALE;
                        rc->rc_skipped = 1;
                        continue;
                }
                if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
                    (zio->io_flags & ZIO_FLAG_SCRUB)) {
                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_data, rc->rc_size,
                            zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
                            vdev_raidz_child_done, rc));
                }
        }

        zio_wait_children_done(zio);
}

/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
{
        vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
        dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
            vdev_description(vd));

        if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
                mutex_enter(&vd->vdev_stat_lock);
                vd->vdev_stat.vs_checksum_errors++;
                mutex_exit(&vd->vdev_stat_lock);
        }

        if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
                zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
                    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
        void *orig[VDEV_RAIDZ_MAXPARITY];
        int c, ret = 0;
        raidz_col_t *rc;

        for (c = 0; c < rm->rm_firstdatacol; c++) {
                rc = &rm->rm_col[c];
                if (!rc->rc_tried || rc->rc_error != 0)
                        continue;
                orig[c] = zio_buf_alloc(rc->rc_size);
                bcopy(rc->rc_data, orig[c], rc->rc_size);
        }

        if (rm->rm_firstdatacol == 1)
                vdev_raidz_generate_parity_p(rm);
        else
                vdev_raidz_generate_parity_pq(rm);

        for (c = 0; c < rm->rm_firstdatacol; c++) {
                rc = &rm->rm_col[c];
                if (!rc->rc_tried || rc->rc_error != 0)
                        continue;
                if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
                        raidz_checksum_error(zio, rc);
                        rc->rc_error = ECKSUM;
                        ret++;
                }
                zio_buf_free(orig[c], rc->rc_size);
        }

        return (ret);
}

static uint64_t raidz_corrected_p;
static uint64_t raidz_corrected_q;
static uint64_t raidz_corrected_pq;

static void
vdev_raidz_io_done(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        vdev_t *cvd;
        raidz_map_t *rm = zio->io_vsd;
        raidz_col_t *rc, *rc1;
        int unexpected_errors = 0;
        int parity_errors = 0;
        int parity_untried = 0;
        int data_errors = 0;
        int n, c, c1;

        ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

        zio->io_error = 0;
        zio->io_numerrors = 0;

        ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
        ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

        for (c = 0; c < rm->rm_cols; c++) {
                rc = &rm->rm_col[c];

                /*
                 * We preserve any EIOs because those may be worth retrying;
                 * whereas ECKSUM and ENXIO are more likely to be persistent.
                 */
                if (rc->rc_error) {
                        if (zio->io_error != EIO)
                                zio->io_error = rc->rc_error;

                        if (c < rm->rm_firstdatacol)
                                parity_errors++;
                        else
                                data_errors++;

                        if (!rc->rc_skipped)
                                unexpected_errors++;

                        zio->io_numerrors++;
                } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
                        parity_untried++;
                }
        }

        if (zio->io_type == ZIO_TYPE_WRITE) {
                /*
                 * If this is not a failfast write, and we were able to
                 * write enough columns to reconstruct the data, good enough.
                 */
                /* XXPOLICY */
                if (zio->io_numerrors <= rm->rm_firstdatacol &&
                    !(zio->io_flags & ZIO_FLAG_FAILFAST))
                        zio->io_error = 0;

                vdev_raidz_map_free(zio);
                zio_next_stage(zio);
                return;
        }

        ASSERT(zio->io_type == ZIO_TYPE_READ);
        /*
         * There are three potential phases for a read:
         *      1. produce valid data from the columns read
         *      2. read all disks and try again
         *      3. perform combinatorial reconstruction
         *
         * Each phase is progressively both more expensive and less likely to
         * occur. If we encounter more errors than we can repair or all phases
         * fail, we have no choice but to return an error.
         */

        /*
         * If the number of errors we saw was correctable -- less than or equal
         * to the number of parity disks read -- attempt to produce data that
         * has a valid checksum. Naturally, this case applies in the absence of
         * any errors.
         */
        if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
                switch (data_errors) {
                case 0:
                        if (zio_checksum_error(zio) == 0) {
                                zio->io_error = 0;

                                /*
                                 * If we read parity information (unnecessarily
                                 * as it happens since no reconstruction was
                                 * needed) regenerate and verify the parity.
                                 * We also regenerate parity when resilvering
                                 * so we can write it out to the failed device
                                 * later.
                                 */
                                if (parity_errors + parity_untried <
                                    rm->rm_firstdatacol ||
                                    (zio->io_flags & ZIO_FLAG_RESILVER)) {
                                        n = raidz_parity_verify(zio, rm);
                                        unexpected_errors += n;
                                        ASSERT(parity_errors + n <=
                                            rm->rm_firstdatacol);
                                }
                                goto done;
                        }
                        break;

                case 1:
                        /*
                         * We either attempt to read all the parity columns or
                         * none of them. If we didn't try to read parity, we
                         * wouldn't be here in the correctable case. There must
                         * also have been fewer parity errors than parity
                         * columns or, again, we wouldn't be in this code path.
                         */
                        ASSERT(parity_untried == 0);
                        ASSERT(parity_errors < rm->rm_firstdatacol);

                        /*
                         * Find the column that reported the error.
                         */
                        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                                rc = &rm->rm_col[c];
                                if (rc->rc_error != 0)
                                        break;
                        }
                        ASSERT(c != rm->rm_cols);
                        ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
                            rc->rc_error == ESTALE);

                        if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
                                vdev_raidz_reconstruct_p(rm, c);
                        } else {
                                ASSERT(rm->rm_firstdatacol > 1);
                                vdev_raidz_reconstruct_q(rm, c);
                        }

                        if (zio_checksum_error(zio) == 0) {
                                zio->io_error = 0;
                                if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
                                        atomic_inc_64(&raidz_corrected_p);
                                else
                                        atomic_inc_64(&raidz_corrected_q);

                                /*
                                 * If there's more than one parity disk that
                                 * was successfully read, confirm that the
                                 * other parity disk produced the correct data.
                                 * This routine is suboptimal in that it
                                 * regenerates both the parity we wish to test
                                 * as well as the parity we just used to
                                 * perform the reconstruction, but this should
                                 * be a relatively uncommon case, and can be
                                 * optimized if it becomes a problem.
                                 * We also regenerate parity when resilvering
                                 * so we can write it out to the failed device
                                 * later.
                                 */
                                if (parity_errors < rm->rm_firstdatacol - 1 ||
                                    (zio->io_flags & ZIO_FLAG_RESILVER)) {
                                        n = raidz_parity_verify(zio, rm);
                                        unexpected_errors += n;
                                        ASSERT(parity_errors + n <=
                                            rm->rm_firstdatacol);
                                }

                                goto done;
                        }
                        break;

                case 2:
                        /*
                         * Two data column errors require double parity.
                         */
                        ASSERT(rm->rm_firstdatacol == 2);

                        /*
                         * Find the two columns that reported errors.
                         */
                        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                                rc = &rm->rm_col[c];
                                if (rc->rc_error != 0)
                                        break;
                        }
                        ASSERT(c != rm->rm_cols);
                        ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
                            rc->rc_error == ESTALE);

                        for (c1 = c++; c < rm->rm_cols; c++) {
                                rc = &rm->rm_col[c];
                                if (rc->rc_error != 0)
                                        break;
                        }
                        ASSERT(c != rm->rm_cols);
                        ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
                            rc->rc_error == ESTALE);

                        vdev_raidz_reconstruct_pq(rm, c1, c);

                        if (zio_checksum_error(zio) == 0) {
                                zio->io_error = 0;
                                atomic_inc_64(&raidz_corrected_pq);

                                goto done;
                        }
                        break;

                default:
                        ASSERT(rm->rm_firstdatacol <= 2);
                        ASSERT(0);
                }
        }

        /*
         * This isn't a typical situation -- either we got a read error or
         * a child silently returned bad data. Read every block so we can
         * try again with as much data and parity as we can track down. If
         * we've already been through once before, all children will be marked
         * as tried so we'll proceed to combinatorial reconstruction.
         */
        unexpected_errors = 1;
        rm->rm_missingdata = 0;
        rm->rm_missingparity = 0;

        for (c = 0; c < rm->rm_cols; c++) {
                if (rm->rm_col[c].rc_tried)
                        continue;

                zio->io_error = 0;
                zio_vdev_io_redone(zio);
                do {
                        rc = &rm->rm_col[c];
                        if (rc->rc_tried)
                                continue;
                        zio_nowait(zio_vdev_child_io(zio, NULL,
                            vd->vdev_child[rc->rc_devidx],
                            rc->rc_offset, rc->rc_data, rc->rc_size,
                            zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
                            vdev_raidz_child_done, rc));
                } while (++c < rm->rm_cols);
                dprintf("rereading\n");
                zio_wait_children_done(zio);
                return;
        }

        /*
         * At this point we've attempted to reconstruct the data given the
         * errors we detected, and we've attempted to read all columns. There
         * must, therefore, be one or more additional problems -- silent errors
         * resulting in invalid data rather than explicit I/O errors resulting
         * in absent data. Before we attempt combinatorial reconstruction make
         * sure we have a chance of coming up with the right answer.
         */
        if (zio->io_numerrors >= rm->rm_firstdatacol) {
                ASSERT(zio->io_error != 0);
                goto done;
        }

        if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
                /*
                 * Attempt to reconstruct the data from parity P.
                 */
                for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                        void *orig;
                        rc = &rm->rm_col[c];

                        orig = zio_buf_alloc(rc->rc_size);
                        bcopy(rc->rc_data, orig, rc->rc_size);
                        vdev_raidz_reconstruct_p(rm, c);

                        if (zio_checksum_error(zio) == 0) {
                                zio_buf_free(orig, rc->rc_size);
                                zio->io_error = 0;
                                atomic_inc_64(&raidz_corrected_p);

                                /*
                                 * If this child didn't know that it returned
                                 * bad data, inform it.
                                 */
                                if (rc->rc_tried && rc->rc_error == 0)
                                        raidz_checksum_error(zio, rc);
                                rc->rc_error = ECKSUM;
                                goto done;
                        }

                        bcopy(orig, rc->rc_data, rc->rc_size);
                        zio_buf_free(orig, rc->rc_size);
                }
        }

        if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
                /*
                 * Attempt to reconstruct the data from parity Q.
                 */
                for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                        void *orig;
                        rc = &rm->rm_col[c];

                        orig = zio_buf_alloc(rc->rc_size);
                        bcopy(rc->rc_data, orig, rc->rc_size);
                        vdev_raidz_reconstruct_q(rm, c);

                        if (zio_checksum_error(zio) == 0) {
                                zio_buf_free(orig, rc->rc_size);
                                zio->io_error = 0;
                                atomic_inc_64(&raidz_corrected_q);

                                /*
                                 * If this child didn't know that it returned
                                 * bad data, inform it.
                                 */
                                if (rc->rc_tried && rc->rc_error == 0)
                                        raidz_checksum_error(zio, rc);
                                rc->rc_error = ECKSUM;
                                goto done;
                        }

                        bcopy(orig, rc->rc_data, rc->rc_size);
                        zio_buf_free(orig, rc->rc_size);
                }
        }

        if (rm->rm_firstdatacol > 1 &&
            rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
            rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
                /*
                 * Attempt to reconstruct the data from both P and Q.
                 */
                for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
                        void *orig, *orig1;
                        rc = &rm->rm_col[c];

                        orig = zio_buf_alloc(rc->rc_size);
                        bcopy(rc->rc_data, orig, rc->rc_size);

                        for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
                                rc1 = &rm->rm_col[c1];

                                orig1 = zio_buf_alloc(rc1->rc_size);
                                bcopy(rc1->rc_data, orig1, rc1->rc_size);

                                vdev_raidz_reconstruct_pq(rm, c, c1);

                                if (zio_checksum_error(zio) == 0) {
                                        zio_buf_free(orig, rc->rc_size);
                                        zio_buf_free(orig1, rc1->rc_size);
                                        zio->io_error = 0;
                                        atomic_inc_64(&raidz_corrected_pq);

                                        /*
                                         * If these children didn't know they
                                         * returned bad data, inform them.
                                         */
                                        if (rc->rc_tried && rc->rc_error == 0)
                                                raidz_checksum_error(zio, rc);
                                        if (rc1->rc_tried && rc1->rc_error == 0)
                                                raidz_checksum_error(zio, rc1);

                                        rc->rc_error = ECKSUM;
                                        rc1->rc_error = ECKSUM;

                                        goto done;
                                }

                                bcopy(orig1, rc1->rc_data, rc1->rc_size);
                                zio_buf_free(orig1, rc1->rc_size);
                        }

                        bcopy(orig, rc->rc_data, rc->rc_size);
                        zio_buf_free(orig, rc->rc_size);
                }
        }

        /*
         * All combinations failed to checksum. Generate checksum ereports for
         * all children.
         */
        zio->io_error = ECKSUM;
        if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
                for (c = 0; c < rm->rm_cols; c++) {
                        rc = &rm->rm_col[c];
                        zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
                            zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
                            rc->rc_offset, rc->rc_size);
                }
        }

done:
        zio_checksum_verified(zio);

        if (zio->io_error == 0 && (spa_mode & FWRITE) &&
            (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
                zio_t *rio;

                /*
                 * Use the good data we have in hand to repair damaged children.
                 *
                 * We issue all repair I/Os as children of 'rio' to arrange
                 * that vdev_raidz_map_free(zio) will be invoked after all
                 * repairs complete, but before we advance to the next stage.
                 */
                rio = zio_null(zio, zio->io_spa,
                    vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);

                for (c = 0; c < rm->rm_cols; c++) {
                        rc = &rm->rm_col[c];
                        cvd = vd->vdev_child[rc->rc_devidx];

                        if (rc->rc_error == 0)
                                continue;

                        dprintf("%s resilvered %s @ 0x%llx error %d\n",
                            vdev_description(vd),
                            vdev_description(cvd),
                            zio->io_offset, rc->rc_error);

                        zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
                            rc->rc_offset, rc->rc_data, rc->rc_size,
                            ZIO_TYPE_WRITE, zio->io_priority,
                            ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
                            ZIO_FLAG_CANFAIL, NULL, NULL));
                }

                zio_nowait(rio);
                zio_wait_children_done(zio);
                return;
        }

        vdev_raidz_map_free(zio);
        zio_next_stage(zio);
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
        if (faulted > vd->vdev_nparity)
                vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_NO_REPLICAS);
        else if (degraded + faulted != 0)
                vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
        else
                vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

vdev_ops_t vdev_raidz_ops = {
        vdev_raidz_open,
        vdev_raidz_close,
        vdev_raidz_asize,
        vdev_raidz_io_start,
        vdev_raidz_io_done,
        vdev_raidz_state_change,
        VDEV_TYPE_RAIDZ,        /* name of this vdev type */
        B_FALSE                 /* not a leaf vdev */
};