/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

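/*
 * CRC-64 table for the ZAP hash below (bit-reflected, polynomial
 * ZFS_CRC64_POLY).  It is filled in lazily by zfs_init_crc(); once the
 * table has been generated, entry 128 equals the polynomial itself,
 * which is how prior initialization is detected.
 */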
static uint64_t zfs_crc64_table[256];

static void
zfs_init_crc(void)
{
        int i, j;
        uint64_t *ct;

        /*
         * Calculate the crc64 table (used for the zap hash
         * function).
         */
        if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
                memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
                for (i = 0; i < 256; i++)
                        for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
                                *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
        }
}

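/*
 * Checksum vector used when checksums are disabled: simply produce an
 * all-zero checksum.
 */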
static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
        ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
        zio_checksum_t  *ci_func[2]; /* checksum function for each byteorder */
        int             ci_correctable; /* number of correctable bits   */
        int             ci_zbt;         /* uses zio block tail? */
        const char      *ci_name;       /* descriptive name */
} zio_checksum_info_t;

#include "fletcher.c"
#include "sha256.c"

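/*
 * Checksum vectors, indexed by the ZIO_CHECKSUM_* value recorded in the
 * block pointer.  The boot code only ever uses the native-endian entry,
 * ci_func[0].
 */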
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
        {{NULL,                 NULL},                  0, 0,   "inherit"},
        {{NULL,                 NULL},                  0, 0,   "on"},
        {{zio_checksum_off,     zio_checksum_off},      0, 0,   "off"},
        {{zio_checksum_SHA256,  NULL},                  1, 1,   "label"},
        {{zio_checksum_SHA256,  NULL},                  1, 1,   "gang_header"},
        {{fletcher_2_native,    NULL},                  0, 1,   "zilog"},
        {{fletcher_2_native,    NULL},                  0, 0,   "fletcher2"},
        {{fletcher_4_native,    NULL},                  1, 0,   "fletcher4"},
        {{zio_checksum_SHA256,  NULL},                  1, 0,   "SHA256"},
};

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
        zio_compress_func_t     *ci_compress;   /* compression function */
        zio_decompress_func_t   *ci_decompress; /* decompression function */
        int                     ci_level;       /* level parameter */
        const char              *ci_name;       /* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
        {NULL,                  NULL,                   0,      "inherit"},
        {NULL,                  NULL,                   0,      "on"},
        {NULL,                  NULL,                   0,      "uncompressed"},
        {NULL,                  lzjb_decompress,        0,      "lzjb"},
        {NULL,                  NULL,                   0,      "empty"},
        {NULL,                  NULL,                   1,      "gzip-1"},
        {NULL,                  NULL,                   2,      "gzip-2"},
        {NULL,                  NULL,                   3,      "gzip-3"},
        {NULL,                  NULL,                   4,      "gzip-4"},
        {NULL,                  NULL,                   5,      "gzip-5"},
        {NULL,                  NULL,                   6,      "gzip-6"},
        {NULL,                  NULL,                   7,      "gzip-7"},
        {NULL,                  NULL,                   8,      "gzip-8"},
        {NULL,                  NULL,                   9,      "gzip-9"},
};

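/*
 * Verify the checksum of a block that has just been read.  Returns 0 if
 * the computed checksum matches the one stored in the block pointer (or
 * in the embedded zio block tail), EIO on a mismatch and EINVAL for an
 * unsupported checksum algorithm.
 */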
static int
zio_checksum_error(const blkptr_t *bp, void *data)
{
        zio_cksum_t zc = bp->blk_cksum;
        unsigned int checksum = BP_GET_CHECKSUM(bp);
        uint64_t size = BP_GET_PSIZE(bp);
        zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
        zio_checksum_info_t *ci = &zio_checksum_table[checksum];
        zio_cksum_t actual_cksum, expected_cksum;

        if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
                return (EINVAL);

        if (ci->ci_zbt) {
                expected_cksum = zbt->zbt_cksum;
                zbt->zbt_cksum = zc;
                ci->ci_func[0](data, size, &actual_cksum);
                zbt->zbt_cksum = expected_cksum;
                zc = expected_cksum;
        } else {
                /* ASSERT(!BP_IS_GANG(bp)); */
                ci->ci_func[0](data, size, &actual_cksum);
        }

        if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) {
                /*printf("ZFS: read checksum failed\n");*/
                return (EIO);
        }

        return (0);
}

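/*
 * Decompress a block using the algorithm recorded in the block pointer.
 * Only algorithms with a decompression vector in the table above
 * (currently just lzjb) are supported by the boot code; anything else
 * fails with EIO.
 */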
static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
        void *dest, uint64_t destsize)
{
        zio_compress_info_t *ci = &zio_compress_table[cpfunc];

        /* ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); */
        if (!ci->ci_decompress) {
                printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
                return (EIO);
        }

        return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

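/*
 * ZAP hash: a salted CRC-64 of the attribute name, computed with the
 * table built in zfs_init_crc() above.
 */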
static uint64_t
zap_hash(uint64_t salt, const char *name)
{
        const uint8_t *cp;
        uint8_t c;
        uint64_t crc = salt;

        /*ASSERT(crc != 0);*/
        /*ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);*/
        for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
                crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

        /*
         * Only use 28 bits, since we need 4 bits in the cookie for the
         * collision differentiator.  We MUST use the high bits, since
         * those are the ones that we first pay attention to when
         * choosing the bucket.
         */
        crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

        return (crc);
}

static char *zfs_alloc_temp(size_t sz);

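/*
 * Per-column state for a single RAID-Z I/O: one entry for each child
 * device touched by the request, with the parity columns (P, then Q)
 * first.
 */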
typedef struct raidz_col {
        uint64_t rc_devidx;             /* child device index for I/O */
        uint64_t rc_offset;             /* device offset */
        uint64_t rc_size;               /* I/O size */
        void *rc_data;                  /* I/O data */
        int rc_error;                   /* I/O error for this device */
        uint8_t rc_tried;               /* Did we attempt this I/O column? */
        uint8_t rc_skipped;             /* Did we skip this I/O column? */
} raidz_col_t;

#define VDEV_RAIDZ_P            0
#define VDEV_RAIDZ_Q            1

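/*
 * Reconstruct a single missing data column x from the P (plain XOR)
 * parity: start from P and XOR in every surviving data column.
 */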
static void
vdev_raidz_reconstruct_p(raidz_col_t *cols, int nparity, int acols, int x)
{
        uint64_t *dst, *src, xcount, ccount, count, i;
        int c;

        xcount = cols[x].rc_size / sizeof (src[0]);
        //ASSERT(xcount <= cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
        //ASSERT(xcount > 0);

        src = cols[VDEV_RAIDZ_P].rc_data;
        dst = cols[x].rc_data;
        for (i = 0; i < xcount; i++, dst++, src++) {
                *dst = *src;
        }

        for (c = nparity; c < acols; c++) {
                src = cols[c].rc_data;
                dst = cols[x].rc_data;

                if (c == x)
                        continue;

                ccount = cols[c].rc_size / sizeof (src[0]);
                count = MIN(ccount, xcount);

                for (i = 0; i < count; i++, dst++, src++) {
                        *dst ^= *src;
                }
        }
}

/*
 * These two tables represent powers and logs of 2 in the Galois field
 * GF(2^8) used by RAID-Z (reduction polynomial 0x11d, hence the 0x1d
 * constants below).  The values were computed by repeatedly multiplying
 * by 2 in that field.
 */
static const uint8_t vdev_raidz_pow2[256] = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
        0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
        0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
        0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
        0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
        0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
        0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
        0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
        0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
        0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
        0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
        0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
        0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
        0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
        0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
        0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
        0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
        0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
        0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
        0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
        0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
        0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
        0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
        0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
        0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
        0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
        0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
        0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
        0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
        0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
        0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
        0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
        0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
        0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
        0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
        0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
        0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
        0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
        0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
        0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
        0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
        0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
        0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
        0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
        0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
        0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
        0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
        0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
        0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
        0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
        0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
        0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
        0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
        0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
        0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
        0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
        0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
        0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
        0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
        0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
        0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
        0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
        0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
        if (a == 0)
                return (0);

        //ASSERT(exp >= 0);
        //ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

        exp += vdev_raidz_log2[a];
        if (exp > 255)
                exp -= 255;

        return (vdev_raidz_pow2[exp]);
}

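/*
 * Generate both parity columns for a set of data columns: P is the plain
 * XOR of the data, Q is the XOR of the data columns multiplied by
 * successive powers of 2 in GF(2^8), accumulated Horner-style eight
 * bytes at a time.
 */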
static void
vdev_raidz_generate_parity_pq(raidz_col_t *cols, int nparity, int acols)
{
        uint64_t *q, *p, *src, pcount, ccount, mask, i;
        int c;

        pcount = cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
        //ASSERT(cols[VDEV_RAIDZ_P].rc_size == cols[VDEV_RAIDZ_Q].rc_size);

        for (c = nparity; c < acols; c++) {
                src = cols[c].rc_data;
                p = cols[VDEV_RAIDZ_P].rc_data;
                q = cols[VDEV_RAIDZ_Q].rc_data;
                ccount = cols[c].rc_size / sizeof (src[0]);

                if (c == nparity) {
                        //ASSERT(ccount == pcount || ccount == 0);
                        for (i = 0; i < ccount; i++, p++, q++, src++) {
                                *q = *src;
                                *p = *src;
                        }
                        for (; i < pcount; i++, p++, q++, src++) {
                                *q = 0;
                                *p = 0;
                        }
                } else {
                        //ASSERT(ccount <= pcount);

                        /*
                         * Rather than multiplying each byte individually, we
                         * handle eight at once by generating a mask based on
                         * the high bit in each byte and using that to
                         * conditionally XOR in 0x1d.
                         */
                        for (i = 0; i < ccount; i++, p++, q++, src++) {
                                mask = *q & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                                *q ^= *src;
                                *p ^= *src;
                        }

                        /*
                         * Treat short columns as though they are full of 0s.
                         */
                        for (; i < pcount; i++, q++) {
                                mask = *q & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                        }
                }
        }
}

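/*
 * Reconstruct a single data column x from the Q parity: recompute Q as
 * if column x were zero, fold in the stored Q, and undo the remaining
 * factor of 2^(acols - 1 - x) by multiplying by its inverse,
 * 2^(255 - (acols - 1 - x)).
 */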
static void
vdev_raidz_reconstruct_q(raidz_col_t *cols, int nparity, int acols, int x)
{
        uint64_t *dst, *src, xcount, ccount, count, mask, i;
        uint8_t *b;
        int c, j, exp;

        xcount = cols[x].rc_size / sizeof (src[0]);
        //ASSERT(xcount <= cols[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

        for (c = nparity; c < acols; c++) {
                src = cols[c].rc_data;
                dst = cols[x].rc_data;

                if (c == x)
                        ccount = 0;
                else
                        ccount = cols[c].rc_size / sizeof (src[0]);

                count = MIN(ccount, xcount);

                if (c == nparity) {
                        for (i = 0; i < count; i++, dst++, src++) {
                                *dst = *src;
                        }
                        for (; i < xcount; i++, dst++) {
                                *dst = 0;
                        }

                } else {
                        /*
                         * For an explanation of this, see the comment in
                         * vdev_raidz_generate_parity_pq() above.
                         */
                        for (i = 0; i < count; i++, dst++, src++) {
                                mask = *dst & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                                *dst ^= *src;
                        }

                        for (; i < xcount; i++, dst++) {
                                mask = *dst & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                        }
                }
        }

        src = cols[VDEV_RAIDZ_Q].rc_data;
        dst = cols[x].rc_data;
        exp = 255 - (acols - 1 - x);

        for (i = 0; i < xcount; i++, dst++, src++) {
                *dst ^= *src;
                for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
                        *b = vdev_raidz_exp2(*b, exp);
                }
        }
}

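/*
 * Reconstruct two missing data columns x and y from the P and Q parity
 * together; the algebra is spelled out in the comment in the body below.
 */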
static void
vdev_raidz_reconstruct_pq(raidz_col_t *cols, int nparity, int acols,
    int x, int y)
{
        uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
        void *pdata, *qdata;
        uint64_t xsize, ysize, i;

        //ASSERT(x < y);
        //ASSERT(x >= nparity);
        //ASSERT(y < acols);

        //ASSERT(cols[x].rc_size >= cols[y].rc_size);

        /*
         * Move the parity data aside -- we're going to compute parity as
         * though columns x and y were full of zeros -- Pxy and Qxy. We want to
         * reuse the parity generation mechanism without trashing the actual
         * parity so we make those columns appear to be full of zeros by
         * setting their lengths to zero.
         */
        pdata = cols[VDEV_RAIDZ_P].rc_data;
        qdata = cols[VDEV_RAIDZ_Q].rc_data;
        xsize = cols[x].rc_size;
        ysize = cols[y].rc_size;

        cols[VDEV_RAIDZ_P].rc_data =
                zfs_alloc_temp(cols[VDEV_RAIDZ_P].rc_size);
        cols[VDEV_RAIDZ_Q].rc_data =
                zfs_alloc_temp(cols[VDEV_RAIDZ_Q].rc_size);
        cols[x].rc_size = 0;
        cols[y].rc_size = 0;

        vdev_raidz_generate_parity_pq(cols, nparity, acols);

        cols[x].rc_size = xsize;
        cols[y].rc_size = ysize;

        p = pdata;
        q = qdata;
        pxy = cols[VDEV_RAIDZ_P].rc_data;
        qxy = cols[VDEV_RAIDZ_Q].rc_data;
        xd = cols[x].rc_data;
        yd = cols[y].rc_data;

        /*
         * We now have:
         *      Pxy = P + D_x + D_y
         *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
         *
         * We can then solve for D_x:
         *      D_x = A * (P + Pxy) + B * (Q + Qxy)
         * where
         *      A = 2^(x - y) * (2^(x - y) + 1)^-1
         *      B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
         *
         * With D_x in hand, we can easily solve for D_y:
         *      D_y = P + Pxy + D_x
         */

        a = vdev_raidz_pow2[255 + x - y];
        b = vdev_raidz_pow2[255 - (acols - 1 - x)];
        tmp = 255 - vdev_raidz_log2[a ^ 1];

        aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
        bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

        for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
                *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
                    vdev_raidz_exp2(*q ^ *qxy, bexp);

                if (i < ysize)
                        *yd = *p ^ *pxy ^ *xd;
        }

        /*
         * Restore the saved parity data.
         */
        cols[VDEV_RAIDZ_P].rc_data = pdata;
        cols[VDEV_RAIDZ_Q].rc_data = qdata;
}

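/*
 * Read a block from a RAID-Z vdev, reconstructing missing or damaged
 * columns from parity when necessary.  The logical range is mapped onto
 * one raidz_col_t per column (parity columns first), the data columns
 * are read and checksummed, and on failure the code falls back to the
 * parity columns, then to combinatorial reconstruction.
 */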
static int
vdev_raidz_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
        size_t psize = BP_GET_PSIZE(bp);
        vdev_t *kid;
        int unit_shift = vdev->v_ashift;
        int dcols = vdev->v_nchildren;
        int nparity = vdev->v_nparity;
        int missingdata, missingparity;
        int parity_errors, data_errors, unexpected_errors, total_errors;
        int parity_untried;
        uint64_t b = offset >> unit_shift;
        uint64_t s = psize >> unit_shift;
        uint64_t f = b % dcols;
        uint64_t o = (b / dcols) << unit_shift;
        uint64_t q, r, coff;
        int c, c1, bc, col, acols, devidx, asize, n;
        static raidz_col_t cols[16];
        raidz_col_t *rc, *rc1;

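        /*
         * b is the starting sector of the block within the vdev, s its size
         * in sectors, f the child the first column lands on and o the byte
         * offset on each child.  q is the number of sectors every column
         * receives from the full rows, r the number of data columns that get
         * one extra sector, bc those columns plus the parity columns, and
         * acols the total number of columns used by this I/O.
         */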
        q = s / (dcols - nparity);
        r = s - q * (dcols - nparity);
        bc = (r == 0 ? 0 : r + nparity);

        acols = (q == 0 ? bc : dcols);
        asize = 0;

        for (c = 0; c < acols; c++) {
                col = f + c;
                coff = o;
                if (col >= dcols) {
                        col -= dcols;
                        coff += 1ULL << unit_shift;
                }
                cols[c].rc_devidx = col;
                cols[c].rc_offset = coff;
                cols[c].rc_size = (q + (c < bc)) << unit_shift;
                cols[c].rc_data = NULL;
                cols[c].rc_error = 0;
                cols[c].rc_tried = 0;
                cols[c].rc_skipped = 0;
                asize += cols[c].rc_size;
        }

        asize = roundup(asize, (nparity + 1) << unit_shift);

        for (c = 0; c < nparity; c++) {
                cols[c].rc_data = zfs_alloc_temp(cols[c].rc_size);
        }

        cols[c].rc_data = buf;

        for (c = c + 1; c < acols; c++)
                cols[c].rc_data = (char *)cols[c - 1].rc_data +
                    cols[c - 1].rc_size;

        /*
         * If all data stored spans all columns, there's a danger that
         * parity will always be on the same device and, since parity
         * isn't read during normal operation, that that device's I/O
         * bandwidth won't be used effectively. We therefore switch
         * the parity every 1MB.
         *
         * ... at least that was, ostensibly, the theory. As a
         * practical matter unless we juggle the parity between all
         * devices evenly, we won't see any benefit. Further,
         * occasional writes that aren't a multiple of the LCM of the
         * number of children and the minimum stripe width are
         * sufficient to avoid pessimal behavior.  Unfortunately, this
         * decision created an implicit on-disk format requirement
         * that we need to support for all eternity, but only for
         * single-parity RAID-Z.
         */
        //ASSERT(acols >= 2);
        //ASSERT(cols[0].rc_size == cols[1].rc_size);

        if (nparity == 1 && (offset & (1ULL << 20))) {
                devidx = cols[0].rc_devidx;
                o = cols[0].rc_offset;
                cols[0].rc_devidx = cols[1].rc_devidx;
                cols[0].rc_offset = cols[1].rc_offset;
                cols[1].rc_devidx = devidx;
                cols[1].rc_offset = o;
        }

        /*
         * Iterate over the columns in reverse order so that we hit
         * the parity last -- any errors along the way will force us
         * to read the parity data.
         */
        missingdata = 0;
        missingparity = 0;
        for (c = acols - 1; c >= 0; c--) {
                rc = &cols[c];
                devidx = rc->rc_devidx;
                STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
                        if (kid->v_id == devidx)
                                break;
                if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
                        if (c >= nparity)
                                missingdata++;
                        else
                                missingparity++;
                        rc->rc_error = ENXIO;
                        rc->rc_tried = 1;       /* don't even try */
                        rc->rc_skipped = 1;
                        continue;
                }
#if 0
                /*
                 * Too hard for the bootcode
                 */
                if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
                        if (c >= nparity)
                                rm->rm_missingdata++;
                        else
                                rm->rm_missingparity++;
                        rc->rc_error = ESTALE;
                        rc->rc_skipped = 1;
                        continue;
                }
#endif
                if (c >= nparity || missingdata > 0) {
                        if (rc->rc_data)
                                rc->rc_error = kid->v_read(kid, NULL,
                                    rc->rc_data, rc->rc_offset, rc->rc_size);
                        else
                                rc->rc_error = ENXIO;
                        rc->rc_tried = 1;
                        rc->rc_skipped = 0;
                }
        }

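        /*
         * Tally what we have so far: parity vs. data errors, errors on
         * columns we actually tried to read (as opposed to ones skipped up
         * front), and parity columns that have not been read yet.
         */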
reconstruct:
        parity_errors = 0;
        data_errors = 0;
        unexpected_errors = 0;
        total_errors = 0;
        parity_untried = 0;
        for (c = 0; c < acols; c++) {
                rc = &cols[c];

                if (rc->rc_error) {
                        if (c < nparity)
                                parity_errors++;
                        else
                                data_errors++;

                        if (!rc->rc_skipped)
                                unexpected_errors++;

                        total_errors++;
                } else if (c < nparity && !rc->rc_tried) {
                        parity_untried++;
                }
        }

        /*
         * There are three potential phases for a read:
         *      1. produce valid data from the columns read
         *      2. read all disks and try again
         *      3. perform combinatorial reconstruction
         *
         * Each phase is progressively both more expensive and less
         * likely to occur. If we encounter more errors than we can
         * repair or all phases fail, we have no choice but to return
         * an error.
         */

        /*
         * If the number of errors we saw was correctable -- less than
         * or equal to the number of parity disks read -- attempt to
         * produce data that has a valid checksum. Naturally, this
         * case applies in the absence of any errors.
         */
        if (total_errors <= nparity - parity_untried) {
                switch (data_errors) {
                case 0:
                        if (zio_checksum_error(bp, buf) == 0)
                                return (0);
                        break;

                case 1:
                        /*
                         * We either attempt to read all the parity columns or
                         * none of them. If we didn't try to read parity, we
                         * wouldn't be here in the correctable case. There must
                         * also have been fewer parity errors than parity
                         * columns or, again, we wouldn't be in this code path.
                         */
                        //ASSERT(parity_untried == 0);
                        //ASSERT(parity_errors < nparity);

                        /*
                         * Find the column that reported the error.
                         */
                        for (c = nparity; c < acols; c++) {
                                rc = &cols[c];
                                if (rc->rc_error != 0)
                                        break;
                        }
                        //ASSERT(c != acols);
                        //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

                        if (cols[VDEV_RAIDZ_P].rc_error == 0) {
                                vdev_raidz_reconstruct_p(cols, nparity,
                                    acols, c);
                        } else {
                                //ASSERT(nparity > 1);
                                vdev_raidz_reconstruct_q(cols, nparity,
                                    acols, c);
                        }

                        if (zio_checksum_error(bp, buf) == 0)
                                return (0);
                        break;

                case 2:
                        /*
                         * Two data column errors require double parity.
                         */
                        //ASSERT(nparity == 2);

                        /*
                         * Find the two columns that reported errors.
                         */
                        for (c = nparity; c < acols; c++) {
                                rc = &cols[c];
                                if (rc->rc_error != 0)
                                        break;
                        }
                        //ASSERT(c != acols);
                        //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

                        for (c1 = c++; c < acols; c++) {
                                rc = &cols[c];
                                if (rc->rc_error != 0)
                                        break;
                        }
                        //ASSERT(c != acols);
                        //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

                        vdev_raidz_reconstruct_pq(cols, nparity, acols,
                            c1, c);

                        if (zio_checksum_error(bp, buf) == 0)
                                return (0);
                        break;

                default:
                        break;
                        //ASSERT(nparity <= 2);
                        //ASSERT(0);
                }
        }

        /*
         * This isn't a typical situation -- either we got a read
         * error or a child silently returned bad data. Read every
         * block so we can try again with as much data and parity as
         * we can track down. If we've already been through once
         * before, all children will be marked as tried so we'll
         * proceed to combinatorial reconstruction.
         */
        n = 0;
        for (c = 0; c < acols; c++) {
                rc = &cols[c];
                if (rc->rc_tried)
                        continue;

                devidx = rc->rc_devidx;
                STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
                        if (kid->v_id == devidx)
                                break;
                if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
                        rc->rc_error = ENXIO;
                        rc->rc_tried = 1;       /* don't even try */
                        rc->rc_skipped = 1;
                        continue;
                }
                if (rc->rc_data)
                        rc->rc_error = kid->v_read(kid, NULL,
                            rc->rc_data, rc->rc_offset, rc->rc_size);
                else
                        rc->rc_error = ENXIO;
                if (rc->rc_error == 0)
                        n++;
                rc->rc_tried = 1;
                rc->rc_skipped = 0;
        }

        /*
         * If we managed to read anything more, retry the
         * reconstruction.
         */
        if (n)
                goto reconstruct;

        /*
         * At this point we've attempted to reconstruct the data given the
         * errors we detected, and we've attempted to read all columns. There
         * must, therefore, be one or more additional problems -- silent errors
         * resulting in invalid data rather than explicit I/O errors resulting
         * in absent data. Before we attempt combinatorial reconstruction make
         * sure we have a chance of coming up with the right answer.
         */
        if (total_errors >= nparity) {
                return (EIO);
        }

        asize = 0;
        for (c = 0; c < acols; c++) {
                rc = &cols[c];
                if (rc->rc_size > asize)
                        asize = rc->rc_size;
        }
        if (cols[VDEV_RAIDZ_P].rc_error == 0) {
                /*
                 * Attempt to reconstruct the data from parity P.
                 */
                void *orig;
                orig = zfs_alloc_temp(asize);
                for (c = nparity; c < acols; c++) {
                        rc = &cols[c];

                        memcpy(orig, rc->rc_data, rc->rc_size);
                        vdev_raidz_reconstruct_p(cols, nparity, acols, c);

                        if (zio_checksum_error(bp, buf) == 0)
                                return (0);

                        memcpy(rc->rc_data, orig, rc->rc_size);
                }
        }

        if (nparity > 1 && cols[VDEV_RAIDZ_Q].rc_error == 0) {
                /*
                 * Attempt to reconstruct the data from parity Q.
                 */
                void *orig;
                orig = zfs_alloc_temp(asize);
                for (c = nparity; c < acols; c++) {
                        rc = &cols[c];

                        memcpy(orig, rc->rc_data, rc->rc_size);
                        vdev_raidz_reconstruct_q(cols, nparity, acols, c);

                        if (zio_checksum_error(bp, buf) == 0)
                                return (0);

                        memcpy(rc->rc_data, orig, rc->rc_size);
                }
        }

        if (nparity > 1 &&
            cols[VDEV_RAIDZ_P].rc_error == 0 &&
            cols[VDEV_RAIDZ_Q].rc_error == 0) {
                /*
                 * Attempt to reconstruct the data from both P and Q.
                 */
                void *orig, *orig1;
                orig = zfs_alloc_temp(asize);
                orig1 = zfs_alloc_temp(asize);
                for (c = nparity; c < acols - 1; c++) {
                        rc = &cols[c];

                        memcpy(orig, rc->rc_data, rc->rc_size);

                        for (c1 = c + 1; c1 < acols; c1++) {
                                rc1 = &cols[c1];

                                memcpy(orig1, rc1->rc_data, rc1->rc_size);

                                vdev_raidz_reconstruct_pq(cols, nparity,
                                    acols, c, c1);

                                if (zio_checksum_error(bp, buf) == 0)
                                        return (0);

                                memcpy(rc1->rc_data, orig1, rc1->rc_size);
                        }

                        memcpy(rc->rc_data, orig, rc->rc_size);
                }
        }

        return (EIO);
}