/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

static uint64_t zfs_crc64_table[256];

static void
zfs_init_crc(void)
{
        int i, j;
        uint64_t *ct;

        /*
         * Calculate the crc64 table (used for the zap hash
         * function).
         */
        if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
                memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
                for (i = 0; i < 256; i++)
                        for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
                                *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
        }
}
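
/*
 * Illustration only: a minimal sketch of how the table above could be used
 * to checksum an arbitrary buffer with the same reflected CRC-64 update
 * that zap_hash() applies per character.  The helper name and the -1ULL
 * seed are assumptions for the example; the boot code itself only uses the
 * table through zap_hash() below.
 */
#if 0
static uint64_t
zfs_crc64_buf_sketch(const void *buf, size_t len)
{
        const uint8_t *p = buf;
        uint64_t crc = -1ULL;           /* hypothetical seed */

        while (len-- != 0)
                crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ *p++) & 0xFF];
        return (crc);
}
#endif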

static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
        ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
        zio_checksum_t  *ci_func[2];    /* checksum function for each byteorder */
        int             ci_correctable; /* number of correctable bits */
        int             ci_zbt;         /* uses zio block tail? */
        const char      *ci_name;       /* descriptive name */
} zio_checksum_info_t;

#include "fletcher.c"
#include "sha256.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
        {{NULL,                 NULL},                  0, 0,   "inherit"},
        {{NULL,                 NULL},                  0, 0,   "on"},
        {{zio_checksum_off,     zio_checksum_off},      0, 0,   "off"},
        {{zio_checksum_SHA256,  NULL},                  1, 1,   "label"},
        {{zio_checksum_SHA256,  NULL},                  1, 1,   "gang_header"},
        {{fletcher_2_native,    NULL},                  0, 1,   "zilog"},
        {{fletcher_2_native,    NULL},                  0, 0,   "fletcher2"},
        {{fletcher_4_native,    NULL},                  1, 0,   "fletcher4"},
        {{zio_checksum_SHA256,  NULL},                  1, 0,   "SHA256"},
};

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
        zio_compress_func_t     *ci_compress;   /* compression function */
        zio_decompress_func_t   *ci_decompress; /* decompression function */
        int                     ci_level;       /* level parameter */
        const char              *ci_name;       /* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
        {NULL,                  NULL,                   0,      "inherit"},
        {NULL,                  NULL,                   0,      "on"},
        {NULL,                  NULL,                   0,      "uncompressed"},
        {NULL,                  lzjb_decompress,        0,      "lzjb"},
        {NULL,                  NULL,                   0,      "empty"},
        {NULL,                  NULL,                   1,      "gzip-1"},
        {NULL,                  NULL,                   2,      "gzip-2"},
        {NULL,                  NULL,                   3,      "gzip-3"},
        {NULL,                  NULL,                   4,      "gzip-4"},
        {NULL,                  NULL,                   5,      "gzip-5"},
        {NULL,                  NULL,                   6,      "gzip-6"},
        {NULL,                  NULL,                   7,      "gzip-7"},
        {NULL,                  NULL,                   8,      "gzip-8"},
        {NULL,                  NULL,                   9,      "gzip-9"},
};

static int
zio_checksum_error(const blkptr_t *bp, void *data)
{
        zio_cksum_t zc = bp->blk_cksum;
        unsigned int checksum = BP_GET_CHECKSUM(bp);
        uint64_t size = BP_GET_PSIZE(bp);
        zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
        zio_checksum_info_t *ci = &zio_checksum_table[checksum];
        zio_cksum_t actual_cksum, expected_cksum;

        if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
                return (EINVAL);

        if (ci->ci_zbt) {
                expected_cksum = zbt->zbt_cksum;
                zbt->zbt_cksum = zc;
                ci->ci_func[0](data, size, &actual_cksum);
                zbt->zbt_cksum = expected_cksum;
                zc = expected_cksum;
        } else {
                /* ASSERT(!BP_IS_GANG(bp)); */
                ci->ci_func[0](data, size, &actual_cksum);
        }

        if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) {
                /*printf("ZFS: read checksum failed\n");*/
                return (EIO);
        }

        return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
        void *dest, uint64_t destsize)
{
        zio_compress_info_t *ci = &zio_compress_table[cpfunc];

        /* ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); */
        if (!ci->ci_decompress) {
                printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
                return (EIO);
        }

        return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}
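
/*
 * Illustration only: a minimal sketch of how a caller might pair the usual
 * block-pointer macros with zio_decompress_data() after a raw read.  The
 * helper name, parameter layout and surrounding buffer handling are
 * assumptions for the example; the real callers live elsewhere in the boot
 * code.
 */
#if 0
static int
zio_read_and_decompress_sketch(const blkptr_t *bp, void *pbuf, void *lbuf)
{
        int cpfunc = BP_GET_COMPRESS(bp);       /* compression algorithm */
        uint64_t psize = BP_GET_PSIZE(bp);      /* physical (on-disk) size */
        uint64_t lsize = BP_GET_LSIZE(bp);      /* logical (decompressed) size */

        if (cpfunc == ZIO_COMPRESS_OFF) {
                memcpy(lbuf, pbuf, psize);
                return (0);
        }
        return (zio_decompress_data(cpfunc, pbuf, psize, lbuf, lsize));
}
#endif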

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
        const uint8_t *cp;
        uint8_t c;
        uint64_t crc = salt;

        /*ASSERT(crc != 0);*/
        /*ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);*/
        for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
                crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

        /*
         * Only use 28 bits, since we need 4 bits in the cookie for the
         * collision differentiator.  We MUST use the high bits, since
         * those are the ones that we first pay attention to when
         * choosing the bucket.
         */
        crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

        return (crc);
}
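
/*
 * Illustration only: zap_hash() keeps just the top ZAP_HASHBITS bits of the
 * CRC, so a caller that wants the n-bit prefix used for bucket selection can
 * shift the result down.  The helper name below is an assumption for the
 * example, not part of the boot code.
 */
#if 0
static uint64_t
zap_hash_prefix_sketch(uint64_t salt, const char *name, int prefix_len)
{
        /* High-order bits are consumed first when choosing a bucket. */
        return (zap_hash(salt, name) >> (64 - prefix_len));
}
#endif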

static char *zfs_alloc_temp(size_t sz);

typedef struct raidz_col {
        uint64_t rc_devidx;             /* child device index for I/O */
        uint64_t rc_offset;             /* device offset */
        uint64_t rc_size;               /* I/O size */
        void *rc_data;                  /* I/O data */
        int rc_error;                   /* I/O error for this device */
        uint8_t rc_tried;               /* Did we attempt this I/O column? */
        uint8_t rc_skipped;             /* Did we skip this I/O column? */
} raidz_col_t;

#define VDEV_RAIDZ_P            0
#define VDEV_RAIDZ_Q            1

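/*
 * Reconstruct a single missing data column (column x) from the P parity.
 * P is a plain XOR across the data columns, so the missing column is
 * simply P XORed with every surviving data column.
 */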
static void
vdev_raidz_reconstruct_p(raidz_col_t *cols, int nparity, int acols, int x)
{
        uint64_t *dst, *src, xcount, ccount, count, i;
        int c;

        xcount = cols[x].rc_size / sizeof (src[0]);
        //ASSERT(xcount <= cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
        //ASSERT(xcount > 0);

        src = cols[VDEV_RAIDZ_P].rc_data;
        dst = cols[x].rc_data;
        for (i = 0; i < xcount; i++, dst++, src++) {
                *dst = *src;
        }

        for (c = nparity; c < acols; c++) {
                src = cols[c].rc_data;
                dst = cols[x].rc_data;

                if (c == x)
                        continue;

                ccount = cols[c].rc_size / sizeof (src[0]);
                count = MIN(ccount, xcount);

                for (i = 0; i < count; i++, dst++, src++) {
                        *dst ^= *src;
                }
        }
}

/*
 * These two tables represent powers and logs of 2 in GF(2^8), the Galois
 * field used by RAID-Z.  The values were computed by repeatedly multiplying
 * by 2 in that field.
 */
static const uint8_t vdev_raidz_pow2[256] = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
        0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
        0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
        0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
        0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
        0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
        0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
        0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
        0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
        0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
        0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
        0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
        0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
        0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
        0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
        0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
        0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
        0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
        0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
        0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
        0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
        0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
        0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
        0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
        0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
        0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
        0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
        0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
        0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
        0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
        0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
        0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
        0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
        0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
        0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
        0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
        0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
        0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
        0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
        0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
        0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
        0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
        0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
        0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
        0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
        0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
        0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
        0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
        0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
        0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
        0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
        0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
        0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
        0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
        0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
        0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
        0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
        0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
        0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
        0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
        0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
        0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
        0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
        if (a == 0)
                return (0);

        //ASSERT(exp >= 0);
        //ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

        exp += vdev_raidz_log2[a];
        if (exp > 255)
                exp -= 255;

        return (vdev_raidz_pow2[exp]);
}
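
/*
 * Illustration only: general multiplication in GF(2^8) falls out of the same
 * log/pow tables -- add the discrete logs modulo 255 and look the result back
 * up.  vdev_raidz_exp2() above is the special case where one operand is
 * already given as a power of 2.  The helper name is an assumption for the
 * example; the boot code never needs the general form.
 */
#if 0
static uint8_t
vdev_raidz_mul_sketch(uint8_t a, uint8_t b)
{
        int exp;

        if (a == 0 || b == 0)
                return (0);
        exp = vdev_raidz_log2[a] + vdev_raidz_log2[b];
        if (exp > 255)
                exp -= 255;
        return (vdev_raidz_pow2[exp]);
}
#endif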

static void
vdev_raidz_generate_parity_pq(raidz_col_t *cols, int nparity, int acols)
{
        uint64_t *q, *p, *src, pcount, ccount, mask, i;
        int c;

        pcount = cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
        //ASSERT(cols[VDEV_RAIDZ_P].rc_size == cols[VDEV_RAIDZ_Q].rc_size);

        for (c = nparity; c < acols; c++) {
                src = cols[c].rc_data;
                p = cols[VDEV_RAIDZ_P].rc_data;
                q = cols[VDEV_RAIDZ_Q].rc_data;
                ccount = cols[c].rc_size / sizeof (src[0]);

                if (c == nparity) {
                        //ASSERT(ccount == pcount || ccount == 0);
                        for (i = 0; i < ccount; i++, p++, q++, src++) {
                                *q = *src;
                                *p = *src;
                        }
                        for (; i < pcount; i++, p++, q++, src++) {
                                *q = 0;
                                *p = 0;
                        }
                } else {
                        //ASSERT(ccount <= pcount);

                        /*
                         * Rather than multiplying each byte by 2
                         * individually, we handle 8 bytes at once by
                         * generating a mask based on the high bit in
                         * each byte and using that to conditionally
                         * XOR in 0x1d.
                         */
                        for (i = 0; i < ccount; i++, p++, q++, src++) {
                                mask = *q & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                                *q ^= *src;
                                *p ^= *src;
                        }

                        /*
                         * Treat short columns as though they are full of 0s.
                         */
                        for (; i < pcount; i++, q++) {
                                mask = *q & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                        }
                }
        }
}
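
/*
 * Illustration only: the masked 64-bit update above is equivalent to
 * multiplying each byte by 2 in GF(2^8) -- shift left and, if the high bit
 * was set, XOR in 0x1d (the low byte of the generator polynomial).  A
 * byte-at-a-time sketch of the same step, with an assumed helper name:
 */
#if 0
static void
vdev_raidz_q_step_sketch(uint8_t *q, size_t len)
{
        size_t i;

        for (i = 0; i < len; i++)
                q[i] = (q[i] << 1) ^ ((q[i] & 0x80) ? 0x1d : 0);
}
#endif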

static void
vdev_raidz_reconstruct_q(raidz_col_t *cols, int nparity, int acols, int x)
{
        uint64_t *dst, *src, xcount, ccount, count, mask, i;
        uint8_t *b;
        int c, j, exp;

        xcount = cols[x].rc_size / sizeof (src[0]);
        //ASSERT(xcount <= cols[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

        for (c = nparity; c < acols; c++) {
                src = cols[c].rc_data;
                dst = cols[x].rc_data;

                if (c == x)
                        ccount = 0;
                else
                        ccount = cols[c].rc_size / sizeof (src[0]);

                count = MIN(ccount, xcount);

                if (c == nparity) {
                        for (i = 0; i < count; i++, dst++, src++) {
                                *dst = *src;
                        }
                        for (; i < xcount; i++, dst++) {
                                *dst = 0;
                        }

                } else {
                        /*
                         * For an explanation of this, see the comment in
                         * vdev_raidz_generate_parity_pq() above.
                         */
                        for (i = 0; i < count; i++, dst++, src++) {
                                mask = *dst & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                                *dst ^= *src;
                        }

                        for (; i < xcount; i++, dst++) {
                                mask = *dst & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                        }
                }
        }

        src = cols[VDEV_RAIDZ_Q].rc_data;
        dst = cols[x].rc_data;
        exp = 255 - (acols - 1 - x);

        for (i = 0; i < xcount; i++, dst++, src++) {
                *dst ^= *src;
                for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
                        *b = vdev_raidz_exp2(*b, exp);
                }
        }
}


static void
vdev_raidz_reconstruct_pq(raidz_col_t *cols, int nparity, int acols,
    int x, int y)
{
        uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
        void *pdata, *qdata;
        uint64_t xsize, ysize, i;

        //ASSERT(x < y);
        //ASSERT(x >= nparity);
        //ASSERT(y < acols);

        //ASSERT(cols[x].rc_size >= cols[y].rc_size);

        /*
         * Move the parity data aside -- we're going to compute parity as
         * though columns x and y were full of zeros -- Pxy and Qxy. We want to
         * reuse the parity generation mechanism without trashing the actual
         * parity so we make those columns appear to be full of zeros by
         * setting their lengths to zero.
         */
        pdata = cols[VDEV_RAIDZ_P].rc_data;
        qdata = cols[VDEV_RAIDZ_Q].rc_data;
        xsize = cols[x].rc_size;
        ysize = cols[y].rc_size;

        cols[VDEV_RAIDZ_P].rc_data =
                zfs_alloc_temp(cols[VDEV_RAIDZ_P].rc_size);
        cols[VDEV_RAIDZ_Q].rc_data =
                zfs_alloc_temp(cols[VDEV_RAIDZ_Q].rc_size);
        cols[x].rc_size = 0;
        cols[y].rc_size = 0;

        vdev_raidz_generate_parity_pq(cols, nparity, acols);

        cols[x].rc_size = xsize;
        cols[y].rc_size = ysize;

        p = pdata;
        q = qdata;
        pxy = cols[VDEV_RAIDZ_P].rc_data;
        qxy = cols[VDEV_RAIDZ_Q].rc_data;
        xd = cols[x].rc_data;
        yd = cols[y].rc_data;

        /*
         * We now have:
         *      Pxy = P + D_x + D_y
         *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
         *
         * We can then solve for D_x:
         *      D_x = A * (P + Pxy) + B * (Q + Qxy)
         * where
         *      A = 2^(x - y) * (2^(x - y) + 1)^-1
         *      B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
         *
         * With D_x in hand, we can easily solve for D_y:
         *      D_y = P + Pxy + D_x
         */

        a = vdev_raidz_pow2[255 + x - y];
        b = vdev_raidz_pow2[255 - (acols - 1 - x)];
        tmp = 255 - vdev_raidz_log2[a ^ 1];

        aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
        bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

        for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
                *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
                    vdev_raidz_exp2(*q ^ *qxy, bexp);

                if (i < ysize)
                        *yd = *p ^ *pxy ^ *xd;
        }

        /*
         * Restore the saved parity data.
         */
        cols[VDEV_RAIDZ_P].rc_data = pdata;
        cols[VDEV_RAIDZ_Q].rc_data = qdata;
}

static int
vdev_raidz_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
        size_t psize = BP_GET_PSIZE(bp);
        vdev_t *kid;
        int unit_shift = vdev->v_ashift;
        int dcols = vdev->v_nchildren;
        int nparity = vdev->v_nparity;
        int missingdata, missingparity;
        int parity_errors, data_errors, unexpected_errors, total_errors;
        int parity_untried;
        uint64_t b = offset >> unit_shift;
        uint64_t s = psize >> unit_shift;
        uint64_t f = b % dcols;
        uint64_t o = (b / dcols) << unit_shift;
        int q, r, c, c1, bc, col, acols, coff, devidx, asize, n;
        static raidz_col_t cols[16];
        raidz_col_t *rc, *rc1;

        q = s / (dcols - nparity);
        r = s - q * (dcols - nparity);
        bc = (r == 0 ? 0 : r + nparity);

        acols = (q == 0 ? bc : dcols);
        asize = 0;

        for (c = 0; c < acols; c++) {
                col = f + c;
                coff = o;
                if (col >= dcols) {
                        col -= dcols;
                        coff += 1ULL << unit_shift;
                }
                cols[c].rc_devidx = col;
                cols[c].rc_offset = coff;
                cols[c].rc_size = (q + (c < bc)) << unit_shift;
                cols[c].rc_data = NULL;
                cols[c].rc_error = 0;
                cols[c].rc_tried = 0;
                cols[c].rc_skipped = 0;
                asize += cols[c].rc_size;
        }
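        /*
         * Worked example of the layout math above: with dcols = 4,
         * nparity = 1 and a block of s = 10 sectors, q = 3 full rows and
         * r = 1 leftover sector, so bc = 2 and acols = 4.  Column 0 (the
         * parity column) and column 1 get q + 1 = 4 sectors each, columns
         * 2 and 3 get q = 3, for an asize of 14 sectors: 10 of data plus
         * 4 of parity.
         */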

        asize = roundup(asize, (nparity + 1) << unit_shift);

        for (c = 0; c < nparity; c++) {
                cols[c].rc_data = zfs_alloc_temp(cols[c].rc_size);
        }

        cols[c].rc_data = buf;

        for (c = c + 1; c < acols; c++)
                cols[c].rc_data = (char *)cols[c - 1].rc_data +
                    cols[c - 1].rc_size;

        /*
         * If all data stored spans all columns, there's a danger that
         * parity will always be on the same device and, since parity
         * isn't read during normal operation, that that device's I/O
         * bandwidth won't be used effectively. We therefore switch
         * the parity every 1MB.
         *
         * ... at least that was, ostensibly, the theory. As a
         * practical matter unless we juggle the parity between all
         * devices evenly, we won't see any benefit. Further,
         * occasional writes that aren't a multiple of the LCM of the
         * number of children and the minimum stripe width are
         * sufficient to avoid pessimal behavior.  Unfortunately, this
         * decision created an implicit on-disk format requirement
         * that we need to support for all eternity, but only for
         * single-parity RAID-Z.
         */
        //ASSERT(acols >= 2);
        //ASSERT(cols[0].rc_size == cols[1].rc_size);

        if (nparity == 1 && (offset & (1ULL << 20))) {
                devidx = cols[0].rc_devidx;
                o = cols[0].rc_offset;
                cols[0].rc_devidx = cols[1].rc_devidx;
                cols[0].rc_offset = cols[1].rc_offset;
                cols[1].rc_devidx = devidx;
                cols[1].rc_offset = o;
        }

        /*
         * Iterate over the columns in reverse order so that we hit
         * the parity last -- any errors along the way will force us
         * to read the parity data.
         */
        missingdata = 0;
        missingparity = 0;
        for (c = acols - 1; c >= 0; c--) {
                rc = &cols[c];
                devidx = rc->rc_devidx;
                STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
                        if (kid->v_id == devidx)
                                break;
                if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
                        if (c >= nparity)
                                missingdata++;
                        else
                                missingparity++;
                        rc->rc_error = ENXIO;
                        rc->rc_tried = 1;       /* don't even try */
                        rc->rc_skipped = 1;
                        continue;
                }
#if 0
                /*
                 * Too hard for the bootcode
                 */
                if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
                        if (c >= nparity)
                                rm->rm_missingdata++;
                        else
                                rm->rm_missingparity++;
                        rc->rc_error = ESTALE;
                        rc->rc_skipped = 1;
                        continue;
                }
#endif
                if (c >= nparity || missingdata > 0) {
                        if (rc->rc_data)
                                rc->rc_error = kid->v_read(kid, NULL,
                                    rc->rc_data, rc->rc_offset, rc->rc_size);
                        else
                                rc->rc_error = ENXIO;
                        rc->rc_tried = 1;
                        rc->rc_skipped = 0;
                }
        }

reconstruct:
        parity_errors = 0;
        data_errors = 0;
        unexpected_errors = 0;
        total_errors = 0;
        parity_untried = 0;
        for (c = 0; c < acols; c++) {
                rc = &cols[c];

                if (rc->rc_error) {
                        if (c < nparity)
                                parity_errors++;
                        else
                                data_errors++;

                        if (!rc->rc_skipped)
                                unexpected_errors++;

                        total_errors++;
                } else if (c < nparity && !rc->rc_tried) {
                        parity_untried++;
                }
        }

        /*
         * There are three potential phases for a read:
         *      1. produce valid data from the columns read
         *      2. read all disks and try again
         *      3. perform combinatorial reconstruction
         *
         * Each phase is progressively both more expensive and less
         * likely to occur. If we encounter more errors than we can
         * repair or all phases fail, we have no choice but to return
         * an error.
         */

        /*
         * If the number of errors we saw was correctable -- less than
         * or equal to the number of parity disks read -- attempt to
         * produce data that has a valid checksum. Naturally, this
         * case applies in the absence of any errors.
         */
        if (total_errors <= nparity - parity_untried) {
                switch (data_errors) {
                case 0:
                        if (zio_checksum_error(bp, buf) == 0)
                                return (0);
                        break;

                case 1:
                        /*
                         * We either attempt to read all the parity columns or
                         * none of them. If we didn't try to read parity, we
                         * wouldn't be here in the correctable case. There must
                         * also have been fewer parity errors than parity
                         * columns or, again, we wouldn't be in this code path.
                         */
                        //ASSERT(parity_untried == 0);
                        //ASSERT(parity_errors < nparity);

                        /*
                         * Find the column that reported the error.
                         */
                        for (c = nparity; c < acols; c++) {
                                rc = &cols[c];
                                if (rc->rc_error != 0)
                                        break;
                        }
                        //ASSERT(c != acols);
                        //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

                        if (cols[VDEV_RAIDZ_P].rc_error == 0) {
                                vdev_raidz_reconstruct_p(cols, nparity,
                                    acols, c);
                        } else {
                                //ASSERT(nparity > 1);
                                vdev_raidz_reconstruct_q(cols, nparity,
                                    acols, c);
                        }

                        if (zio_checksum_error(bp, buf) == 0)
                                return (0);
                        break;

                case 2:
                        /*
                         * Two data column errors require double parity.
                         */
                        //ASSERT(nparity == 2);

                        /*
                         * Find the two columns that reported errors.
                         */
                        for (c = nparity; c < acols; c++) {
                                rc = &cols[c];
                                if (rc->rc_error != 0)
                                        break;
                        }
                        //ASSERT(c != acols);
                        //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

                        for (c1 = c++; c < acols; c++) {
                                rc = &cols[c];
                                if (rc->rc_error != 0)
                                        break;
                        }
                        //ASSERT(c != acols);
                        //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

                        vdev_raidz_reconstruct_pq(cols, nparity, acols,
                            c1, c);

                        if (zio_checksum_error(bp, buf) == 0)
                                return (0);
                        break;

                default:
                        break;
                        //ASSERT(nparity <= 2);
                        //ASSERT(0);
                }
        }

        /*
         * This isn't a typical situation -- either we got a read
         * error or a child silently returned bad data. Read every
         * block so we can try again with as much data and parity as
         * we can track down. If we've already been through once
         * before, all children will be marked as tried so we'll
         * proceed to combinatorial reconstruction.
         */
        n = 0;
        for (c = 0; c < acols; c++) {
                rc = &cols[c];
                if (rc->rc_tried)
                        continue;

                devidx = rc->rc_devidx;
                STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
                        if (kid->v_id == devidx)
                                break;
                if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
                        rc->rc_error = ENXIO;
                        rc->rc_tried = 1;       /* don't even try */
                        rc->rc_skipped = 1;
                        continue;
                }
                if (rc->rc_data)
                        rc->rc_error = kid->v_read(kid, NULL,
                            rc->rc_data, rc->rc_offset, rc->rc_size);
                else
                        rc->rc_error = ENXIO;
                if (rc->rc_error == 0)
                        n++;
                rc->rc_tried = 1;
                rc->rc_skipped = 0;
        }

        /*
         * If we managed to read anything more, retry the
         * reconstruction.
         */
        if (n)
                goto reconstruct;

        /*
         * At this point we've attempted to reconstruct the data given the
         * errors we detected, and we've attempted to read all columns. There
         * must, therefore, be one or more additional problems -- silent errors
         * resulting in invalid data rather than explicit I/O errors resulting
         * in absent data. Before we attempt combinatorial reconstruction make
         * sure we have a chance of coming up with the right answer.
         */
        if (total_errors >= nparity) {
                return (EIO);
        }

        asize = 0;
        for (c = 0; c < acols; c++) {
                rc = &cols[c];
                if (rc->rc_size > asize)
                        asize = rc->rc_size;
        }
        if (cols[VDEV_RAIDZ_P].rc_error == 0) {
                /*
                 * Attempt to reconstruct the data from parity P.
                 */
                void *orig;
                orig = zfs_alloc_temp(asize);
                for (c = nparity; c < acols; c++) {
                        rc = &cols[c];

                        memcpy(orig, rc->rc_data, rc->rc_size);
                        vdev_raidz_reconstruct_p(cols, nparity, acols, c);

                        if (zio_checksum_error(bp, buf) == 0)
                                return (0);

                        memcpy(rc->rc_data, orig, rc->rc_size);
                }
        }

        if (nparity > 1 && cols[VDEV_RAIDZ_Q].rc_error == 0) {
                /*
                 * Attempt to reconstruct the data from parity Q.
                 */
                void *orig;
                orig = zfs_alloc_temp(asize);
                for (c = nparity; c < acols; c++) {
                        rc = &cols[c];

                        memcpy(orig, rc->rc_data, rc->rc_size);
                        vdev_raidz_reconstruct_q(cols, nparity, acols, c);

                        if (zio_checksum_error(bp, buf) == 0)
                                return (0);

                        memcpy(rc->rc_data, orig, rc->rc_size);
                }
        }

        if (nparity > 1 &&
            cols[VDEV_RAIDZ_P].rc_error == 0 &&
            cols[VDEV_RAIDZ_Q].rc_error == 0) {
                /*
                 * Attempt to reconstruct the data from both P and Q.
                 */
                void *orig, *orig1;
                orig = zfs_alloc_temp(asize);
                orig1 = zfs_alloc_temp(asize);
                for (c = nparity; c < acols - 1; c++) {
                        rc = &cols[c];

                        memcpy(orig, rc->rc_data, rc->rc_size);

                        for (c1 = c + 1; c1 < acols; c1++) {
                                rc1 = &cols[c1];

                                memcpy(orig1, rc1->rc_data, rc1->rc_size);

                                vdev_raidz_reconstruct_pq(cols, nparity,
                                    acols, c, c1);

                                if (zio_checksum_error(bp, buf) == 0)
                                        return (0);

                                memcpy(rc1->rc_data, orig1, rc1->rc_size);
                        }

                        memcpy(rc->rc_data, orig, rc->rc_size);
                }
        }

        return (EIO);
}
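
/*
 * Illustration only: vdev_raidz_read() has the same shape as the child reads
 * issued above through kid->v_read(), so the surrounding boot code can plug
 * it in as the read routine of a raidz vdev.  The vdev_read_t typedef and
 * the wiring shown here are assumptions for the example; the actual hookup
 * happens where the vdev tree is built.
 */
#if 0
typedef int vdev_read_t(vdev_t *, const blkptr_t *, void *, off_t, size_t);

static void
vdev_raidz_attach_sketch(vdev_t *raidz)
{
        raidz->v_read = vdev_raidz_read;        /* hypothetical wiring */
}
#endif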