/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

static uint64_t zfs_crc64_table[256];

static void
zfs_init_crc(void)
{
        int i, j;
        uint64_t *ct;

        /*
         * Calculate the crc64 table (used for the zap hash
         * function).
         */
        if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
                memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
                for (i = 0; i < 256; i++)
                        for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
                                *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
        }
}
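
/*
 * The loop above builds the table for a reflected (LSB-first) CRC-64:
 * -(*ct & 1) is all-ones when the low bit is set and zero otherwise, so the
 * XOR with ZFS_CRC64_POLY is applied conditionally without a branch.  The
 * zfs_crc64_table[128] test works as an "already initialized" check because
 * entry 128 (input byte 0x80) is shifted right seven times before its bit
 * reaches bit 0 and the polynomial is XOR-ed in on the final iteration,
 * leaving that entry equal to ZFS_CRC64_POLY.
 */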

static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
        ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
        zio_checksum_t  *ci_func[2];    /* checksum function for each byteorder */
        int             ci_correctable; /* number of correctable bits */
        int             ci_zbt;         /* uses zio block tail? */
        const char      *ci_name;       /* descriptive name */
} zio_checksum_info_t;

#include "fletcher.c"
#include "sha256.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
        {{NULL,                 NULL},                  0, 0,   "inherit"},
        {{NULL,                 NULL},                  0, 0,   "on"},
        {{zio_checksum_off,     zio_checksum_off},      0, 0,   "off"},
        {{zio_checksum_SHA256,  NULL},                  1, 1,   "label"},
        {{zio_checksum_SHA256,  NULL},                  1, 1,   "gang_header"},
        {{fletcher_2_native,    NULL},                  0, 1,   "zilog"},
        {{fletcher_2_native,    NULL},                  0, 0,   "fletcher2"},
        {{fletcher_4_native,    NULL},                  1, 0,   "fletcher4"},
        {{zio_checksum_SHA256,  NULL},                  1, 0,   "SHA256"},
};
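
/*
 * The table is indexed by the ZIO_CHECKSUM_* value that BP_GET_CHECKSUM()
 * extracts from a block pointer; this boot code only ever calls the
 * native-byteorder function, ci_func[0].  Entries with ci_zbt set ("label",
 * "gang_header" and "zilog") keep their expected checksum in a
 * zio_block_tail at the end of the buffer rather than in the block pointer,
 * which is why zio_checksum_error() below swaps the block pointer's checksum
 * into that field before recomputing and compares against the saved value.
 */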

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
        zio_compress_func_t     *ci_compress;   /* compression function */
        zio_decompress_func_t   *ci_decompress; /* decompression function */
        int                     ci_level;       /* level parameter */
        const char              *ci_name;       /* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
        {NULL,                  NULL,                   0,      "inherit"},
        {NULL,                  NULL,                   0,      "on"},
        {NULL,                  NULL,                   0,      "uncompressed"},
        {NULL,                  lzjb_decompress,        0,      "lzjb"},
        {NULL,                  NULL,                   0,      "empty"},
        {NULL,                  NULL,                   1,      "gzip-1"},
        {NULL,                  NULL,                   2,      "gzip-2"},
        {NULL,                  NULL,                   3,      "gzip-3"},
        {NULL,                  NULL,                   4,      "gzip-4"},
        {NULL,                  NULL,                   5,      "gzip-5"},
        {NULL,                  NULL,                   6,      "gzip-6"},
        {NULL,                  NULL,                   7,      "gzip-7"},
        {NULL,                  NULL,                   8,      "gzip-8"},
        {NULL,                  NULL,                   9,      "gzip-9"},
};

static int
zio_checksum_error(const blkptr_t *bp, void *data)
{
        zio_cksum_t zc = bp->blk_cksum;
        unsigned int checksum = BP_GET_CHECKSUM(bp);
        uint64_t size = BP_GET_PSIZE(bp);
        zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
        zio_checksum_info_t *ci = &zio_checksum_table[checksum];
        zio_cksum_t actual_cksum, expected_cksum;

        if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
                return (EINVAL);

        if (ci->ci_zbt) {
                expected_cksum = zbt->zbt_cksum;
                zbt->zbt_cksum = zc;
                ci->ci_func[0](data, size, &actual_cksum);
                zbt->zbt_cksum = expected_cksum;
                zc = expected_cksum;
        } else {
                /* ASSERT(!BP_IS_GANG(bp)); */
                ci->ci_func[0](data, size, &actual_cksum);
        }

        if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) {
                /*printf("ZFS: read checksum failed\n");*/
                return (EIO);
        }

        return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
        void *dest, uint64_t destsize)
{
        zio_compress_info_t *ci = &zio_compress_table[cpfunc];

        /* ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); */
        if (!ci->ci_decompress) {
                printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
                return (EIO);
        }

        return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
        const uint8_t *cp;
        uint8_t c;
        uint64_t crc = salt;

        /*ASSERT(crc != 0);*/
        /*ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);*/
        for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
                crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

        /*
         * Only use 28 bits, since we need 4 bits in the cookie for the
         * collision differentiator.  We MUST use the high bits, since
         * those are the ones that we first pay attention to when
         * choosing the bucket.
         */
        crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

        return (crc);
}
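
/*
 * With ZAP_HASHBITS == 28 the mask above keeps only the top 28 bits of the
 * CRC and zeroes the low 36.  A fat ZAP selects its bucket from a prefix of
 * the hash, so a pointer table of 2^n entries is indexed by
 * (hash >> (64 - n)); that is why the comment above insists on preserving
 * the high-order bits.
 */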

static char *zfs_alloc_temp(size_t sz);

typedef struct raidz_col {
        uint64_t rc_devidx;             /* child device index for I/O */
        uint64_t rc_offset;             /* device offset */
        uint64_t rc_size;               /* I/O size */
        void *rc_data;                  /* I/O data */
        int rc_error;                   /* I/O error for this device */
        uint8_t rc_tried;               /* Did we attempt this I/O column? */
        uint8_t rc_skipped;             /* Did we skip this I/O column? */
} raidz_col_t;

#define VDEV_RAIDZ_P            0
#define VDEV_RAIDZ_Q            1

static void
vdev_raidz_reconstruct_p(raidz_col_t *cols, int nparity, int acols, int x)
{
        uint64_t *dst, *src, xcount, ccount, count, i;
        int c;

        xcount = cols[x].rc_size / sizeof (src[0]);
        //ASSERT(xcount <= cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
        //ASSERT(xcount > 0);

        src = cols[VDEV_RAIDZ_P].rc_data;
        dst = cols[x].rc_data;
        for (i = 0; i < xcount; i++, dst++, src++) {
                *dst = *src;
        }

        for (c = nparity; c < acols; c++) {
                src = cols[c].rc_data;
                dst = cols[x].rc_data;

                if (c == x)
                        continue;

                ccount = cols[c].rc_size / sizeof (src[0]);
                count = MIN(ccount, xcount);

                for (i = 0; i < count; i++, dst++, src++) {
                        *dst ^= *src;
                }
        }
}
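
/*
 * Single parity is a plain XOR: P = D_0 ^ D_1 ^ ... ^ D_(n-1).  To rebuild a
 * missing data column the routine above seeds column x with P and then XORs
 * every surviving data column back in; e.g. with three data columns and
 * P = a ^ b ^ c, a lost b is recovered as P ^ a ^ c = b.
 */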

/*
 * These two tables represent powers and logs of 2 in the Galois field
 * GF(2^8) used by RAID-Z, generated by the polynomial x^8 + x^4 + x^3 +
 * x^2 + 1 (0x11d).  The values were computed by repeatedly multiplying by 2.
 */
static const uint8_t vdev_raidz_pow2[256] = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
        0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
        0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
        0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
        0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
        0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
        0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
        0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
        0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
        0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
        0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
        0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
        0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
        0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
        0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
        0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
        0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
        0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
        0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
        0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
        0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
        0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
        0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
        0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
        0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
        0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
        0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
        0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
        0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
        0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
        0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
        0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
        0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
        0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
        0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
        0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
        0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
        0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
        0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
        0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
        0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
        0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
        0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
        0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
        0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
        0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
        0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
        0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
        0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
        0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
        0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
        0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
        0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
        0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
        0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
        0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
        0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
        0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
        0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
        0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
        0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
        0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
        0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
        if (a == 0)
                return (0);

        //ASSERT(exp >= 0);
        //ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

        exp += vdev_raidz_log2[a];
        if (exp > 255)
                exp -= 255;

        return (vdev_raidz_pow2[exp]);
}
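
/*
 * The tables turn multiplication into addition of logarithms: a * 2^exp is
 * pow2[(log2[a] + exp) mod 255], and the subtraction of 255 above is that
 * reduction (2^255 == 1 in this field).  For example,
 * vdev_raidz_exp2(0x03, 50) adds log2[0x03] == 25 to 50 and looks up
 * pow2[75] == 0x0f, which is indeed 0x03 * 0x05 (== 2^50) in GF(2^8).
 */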

static void
vdev_raidz_generate_parity_pq(raidz_col_t *cols, int nparity, int acols)
{
        uint64_t *q, *p, *src, pcount, ccount, mask, i;
        int c;

        pcount = cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
        //ASSERT(cols[VDEV_RAIDZ_P].rc_size == cols[VDEV_RAIDZ_Q].rc_size);

        for (c = nparity; c < acols; c++) {
                src = cols[c].rc_data;
                p = cols[VDEV_RAIDZ_P].rc_data;
                q = cols[VDEV_RAIDZ_Q].rc_data;
                ccount = cols[c].rc_size / sizeof (src[0]);

                if (c == nparity) {
                        //ASSERT(ccount == pcount || ccount == 0);
                        for (i = 0; i < ccount; i++, p++, q++, src++) {
                                *q = *src;
                                *p = *src;
                        }
                        for (; i < pcount; i++, p++, q++, src++) {
                                *q = 0;
                                *p = 0;
                        }
                } else {
                        //ASSERT(ccount <= pcount);

                        /*
                         * Rather than multiplying each byte
                         * individually, we are able to handle 8 at
                         * once by generating a mask based on the high
                         * bit in each byte and using that to
                         * conditionally XOR in 0x1d.
                         */
                        for (i = 0; i < ccount; i++, p++, q++, src++) {
                                mask = *q & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                                *q ^= *src;
                                *p ^= *src;
                        }

                        /*
                         * Treat short columns as though they are full of 0s.
                         */
                        for (; i < pcount; i++, q++) {
                                mask = *q & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                        }
                }
        }
}
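
/*
 * The mask trick above is the 64-bit-lane form of multiplying one byte by 2
 * in GF(2^8): per byte, y = (b << 1) ^ ((b & 0x80) ? 0x1d : 0).  In the wide
 * version, (mask << 1) - (mask >> 7) expands each set high bit into 0xff for
 * that byte lane and 0x00 elsewhere, the & 0xfefe.. keeps the left shift
 * from carrying between lanes, and the & 0x1d1d.. applies the reduction
 * polynomial only to the lanes that overflowed.  For example, doubling 0x80
 * yields 0x00 ^ 0x1d == 0x1d, which matches vdev_raidz_pow2[8].
 */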

static void
vdev_raidz_reconstruct_q(raidz_col_t *cols, int nparity, int acols, int x)
{
        uint64_t *dst, *src, xcount, ccount, count, mask, i;
        uint8_t *b;
        int c, j, exp;

        xcount = cols[x].rc_size / sizeof (src[0]);
        //ASSERT(xcount <= cols[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

        for (c = nparity; c < acols; c++) {
                src = cols[c].rc_data;
                dst = cols[x].rc_data;

                if (c == x)
                        ccount = 0;
                else
                        ccount = cols[c].rc_size / sizeof (src[0]);

                count = MIN(ccount, xcount);

                if (c == nparity) {
                        for (i = 0; i < count; i++, dst++, src++) {
                                *dst = *src;
                        }
                        for (; i < xcount; i++, dst++) {
                                *dst = 0;
                        }

                } else {
                        /*
                         * For an explanation of this, see the comment in
                         * vdev_raidz_generate_parity_pq() above.
                         */
                        for (i = 0; i < count; i++, dst++, src++) {
                                mask = *dst & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                                *dst ^= *src;
                        }

                        for (; i < xcount; i++, dst++) {
                                mask = *dst & 0x8080808080808080ULL;
                                mask = (mask << 1) - (mask >> 7);
                                *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
                                    (mask & 0x1d1d1d1d1d1d1d1dULL);
                        }
                }
        }

        src = cols[VDEV_RAIDZ_Q].rc_data;
        dst = cols[x].rc_data;
        exp = 255 - (acols - 1 - x);

        for (i = 0; i < xcount; i++, dst++, src++) {
                *dst ^= *src;
                for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
                        *b = vdev_raidz_exp2(*b, exp);
                }
        }
}
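
/*
 * The loop over the data columns recomputes Q as if column x held zeros;
 * XOR-ing the real Q back in then leaves 2^(acols - 1 - x) * D_x in the
 * buffer, because that is column x's coefficient in the Q parity.  The final
 * per-byte pass multiplies by 2^(255 - (acols - 1 - x)), and since
 * 2^255 == 1 in GF(2^8) this divides out the coefficient and leaves D_x.
 */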


static void
vdev_raidz_reconstruct_pq(raidz_col_t *cols, int nparity, int acols,
    int x, int y, void *temp_p, void *temp_q)
{
        uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
        void *pdata, *qdata;
        uint64_t xsize, ysize, i;

        //ASSERT(x < y);
        //ASSERT(x >= nparity);
        //ASSERT(y < acols);

        //ASSERT(cols[x].rc_size >= cols[y].rc_size);

        /*
         * Move the parity data aside -- we're going to compute parity as
         * though columns x and y were full of zeros -- Pxy and Qxy. We want to
         * reuse the parity generation mechanism without trashing the actual
         * parity so we make those columns appear to be full of zeros by
         * setting their lengths to zero.
         */
        pdata = cols[VDEV_RAIDZ_P].rc_data;
        qdata = cols[VDEV_RAIDZ_Q].rc_data;
        xsize = cols[x].rc_size;
        ysize = cols[y].rc_size;

        cols[VDEV_RAIDZ_P].rc_data = temp_p;
        cols[VDEV_RAIDZ_Q].rc_data = temp_q;
        cols[x].rc_size = 0;
        cols[y].rc_size = 0;

        vdev_raidz_generate_parity_pq(cols, nparity, acols);

        cols[x].rc_size = xsize;
        cols[y].rc_size = ysize;

        p = pdata;
        q = qdata;
        pxy = cols[VDEV_RAIDZ_P].rc_data;
        qxy = cols[VDEV_RAIDZ_Q].rc_data;
        xd = cols[x].rc_data;
        yd = cols[y].rc_data;

        /*
         * We now have:
         *      Pxy = P + D_x + D_y
         *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
         *
         * We can then solve for D_x:
         *      D_x = A * (P + Pxy) + B * (Q + Qxy)
         * where
         *      A = 2^(x - y) * (2^(x - y) + 1)^-1
         *      B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
         *
         * With D_x in hand, we can easily solve for D_y:
         *      D_y = P + Pxy + D_x
         */

        a = vdev_raidz_pow2[255 + x - y];
        b = vdev_raidz_pow2[255 - (acols - 1 - x)];
        tmp = 255 - vdev_raidz_log2[a ^ 1];

        aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
        bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

        for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
                *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
                    vdev_raidz_exp2(*q ^ *qxy, bexp);

                if (i < ysize)
                        *yd = *p ^ *pxy ^ *xd;
        }

        /*
         * Restore the saved parity data.
         */
        cols[VDEV_RAIDZ_P].rc_data = pdata;
        cols[VDEV_RAIDZ_Q].rc_data = qdata;
}
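
/*
 * The exponent arithmetic mirrors the comment: a is 2^(x - y) (the + 255
 * keeps the pow2 index non-negative, since 2^255 == 1), b is the inverse of
 * column x's Q coefficient, 2^-(acols - 1 - x), and tmp is the logarithm of
 * (2^(x - y) + 1)^-1.  As a sanity check with acols == 4 and data columns
 * x == 2, y == 3: P + Pxy == D_x + D_y and Q + Qxy == 2*D_x + D_y, so adding
 * the two gives 3*D_x; both A and B indeed reduce to 3^-1 in that case, and
 * D_y then follows from D_y = P + Pxy + D_x.
 */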

static int
vdev_raidz_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
        size_t psize = BP_GET_PSIZE(bp);
        vdev_t *kid;
        int unit_shift = vdev->v_ashift;
        int dcols = vdev->v_nchildren;
        int nparity = vdev->v_nparity;
        int missingdata, missingparity;
        int parity_errors, data_errors, unexpected_errors, total_errors;
        int parity_untried;
        uint64_t b = offset >> unit_shift;
        uint64_t s = psize >> unit_shift;
        uint64_t f = b % dcols;
        uint64_t o = (b / dcols) << unit_shift;
        uint64_t q, r, coff;
        int c, c1, bc, col, acols, devidx, asize, n, max_rc_size;
        static raidz_col_t cols[16];
        raidz_col_t *rc, *rc1;
        void *orig, *orig1, *temp_p, *temp_q;

        orig = orig1 = temp_p = temp_q = NULL;

        q = s / (dcols - nparity);
        r = s - q * (dcols - nparity);
        bc = (r == 0 ? 0 : r + nparity);

        acols = (q == 0 ? bc : dcols);
        asize = 0;
        max_rc_size = 0;

        for (c = 0; c < acols; c++) {
                col = f + c;
                coff = o;
                if (col >= dcols) {
                        col -= dcols;
                        coff += 1ULL << unit_shift;
                }
                cols[c].rc_devidx = col;
                cols[c].rc_offset = coff;
                cols[c].rc_size = (q + (c < bc)) << unit_shift;
                cols[c].rc_data = NULL;
                cols[c].rc_error = 0;
                cols[c].rc_tried = 0;
                cols[c].rc_skipped = 0;
                asize += cols[c].rc_size;
                if (cols[c].rc_size > max_rc_size)
                        max_rc_size = cols[c].rc_size;
        }
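
        /*
         * The geometry above mirrors the RAID-Z map setup in the kernel's
         * vdev_raidz.c: b and s are the offset and size in units of
         * 1 << unit_shift, f is the child the stripe starts on, q and r
         * split the data sectors evenly across the data columns, and bc
         * counts the columns that get one extra sector.  For example, on a
         * 5-disk raidz1 with 512-byte sectors, a 16-sector block at offset 0
         * yields q = 4, r = 0, acols = 5: column 0 holds parity and columns
         * 1-4 hold the 16 data sectors, 4 each.  A 3-sector block instead
         * gives q = 0, r = 3, bc = 4, so only acols = 4 one-sector columns
         * are used.
         */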

        asize = roundup(asize, (nparity + 1) << unit_shift);

        for (c = 0; c < nparity; c++) {
                cols[c].rc_data = zfs_alloc_temp(cols[c].rc_size);
        }

        cols[c].rc_data = buf;

        for (c = c + 1; c < acols; c++)
                cols[c].rc_data = (char *)cols[c - 1].rc_data +
                    cols[c - 1].rc_size;

        /*
         * If all data stored spans all columns, there's a danger that
         * parity will always be on the same device and, since parity
         * isn't read during normal operation, that that device's I/O
         * bandwidth won't be used effectively. We therefore switch
         * the parity every 1MB.
         *
         * ... at least that was, ostensibly, the theory. As a
         * practical matter unless we juggle the parity between all
         * devices evenly, we won't see any benefit. Further,
         * occasional writes that aren't a multiple of the LCM of the
         * number of children and the minimum stripe width are
         * sufficient to avoid pessimal behavior.  Unfortunately, this
         * decision created an implicit on-disk format requirement
         * that we need to support for all eternity, but only for
         * single-parity RAID-Z.
         */
        //ASSERT(acols >= 2);
        //ASSERT(cols[0].rc_size == cols[1].rc_size);

        if (nparity == 1 && (offset & (1ULL << 20))) {
                devidx = cols[0].rc_devidx;
                o = cols[0].rc_offset;
                cols[0].rc_devidx = cols[1].rc_devidx;
                cols[0].rc_offset = cols[1].rc_offset;
                cols[1].rc_devidx = devidx;
                cols[1].rc_offset = o;
        }
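
        /*
         * "Every 1MB" is literal: offset & (1ULL << 20) tests bit 20 of the
         * block's byte offset within the RAID-Z vdev, so blocks in
         * alternating 1MB-wide bands swap the device and offset of the
         * parity column with those of the first data column.  Only the
         * placement moves; the column buffers assigned above keep their
         * roles.
         */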

        /*
         * Iterate over the columns in reverse order so that we hit
         * the parity last -- any errors along the way will force us
         * to read the parity data.
         */
        missingdata = 0;
        missingparity = 0;
        for (c = acols - 1; c >= 0; c--) {
                rc = &cols[c];
                devidx = rc->rc_devidx;
                STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
                        if (kid->v_id == devidx)
                                break;
                if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
                        if (c >= nparity)
                                missingdata++;
                        else
                                missingparity++;
                        rc->rc_error = ENXIO;
                        rc->rc_tried = 1;       /* don't even try */
                        rc->rc_skipped = 1;
                        continue;
                }
#if 0
                /*
                 * Too hard for the bootcode
                 */
                if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
                        if (c >= nparity)
                                rm->rm_missingdata++;
                        else
                                rm->rm_missingparity++;
                        rc->rc_error = ESTALE;
                        rc->rc_skipped = 1;
                        continue;
                }
#endif
                if (c >= nparity || missingdata > 0) {
                        if (rc->rc_data)
                                rc->rc_error = kid->v_read(kid, NULL,
                                    rc->rc_data, rc->rc_offset, rc->rc_size);
                        else
                                rc->rc_error = ENXIO;
                        rc->rc_tried = 1;
                        rc->rc_skipped = 0;
                }
        }

reconstruct:
        parity_errors = 0;
        data_errors = 0;
        unexpected_errors = 0;
        total_errors = 0;
        parity_untried = 0;
        for (c = 0; c < acols; c++) {
                rc = &cols[c];

                if (rc->rc_error) {
                        if (c < nparity)
                                parity_errors++;
                        else
                                data_errors++;

                        if (!rc->rc_skipped)
                                unexpected_errors++;

                        total_errors++;
                } else if (c < nparity && !rc->rc_tried) {
                        parity_untried++;
                }
        }

        /*
         * There are three potential phases for a read:
         *      1. produce valid data from the columns read
         *      2. read all disks and try again
         *      3. perform combinatorial reconstruction
         *
         * Each phase is progressively both more expensive and less
         * likely to occur. If we encounter more errors than we can
         * repair or all phases fail, we have no choice but to return
         * an error.
         */

        /*
         * If the number of errors we saw was correctable -- less than
         * or equal to the number of parity disks read -- attempt to
         * produce data that has a valid checksum. Naturally, this
         * case applies in the absence of any errors.
         */
        if (total_errors <= nparity - parity_untried) {
                switch (data_errors) {
                case 0:
                        if (zio_checksum_error(bp, buf) == 0)
                                return (0);
                        break;

                case 1:
                        /*
                         * We either attempt to read all the parity columns or
                         * none of them. If we didn't try to read parity, we
                         * wouldn't be here in the correctable case. There must
                         * also have been fewer parity errors than parity
                         * columns or, again, we wouldn't be in this code path.
                         */
                        //ASSERT(parity_untried == 0);
                        //ASSERT(parity_errors < nparity);

                        /*
                         * Find the column that reported the error.
                         */
                        for (c = nparity; c < acols; c++) {
                                rc = &cols[c];
                                if (rc->rc_error != 0)
                                        break;
                        }
                        //ASSERT(c != acols);
                        //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

                        if (cols[VDEV_RAIDZ_P].rc_error == 0) {
                                vdev_raidz_reconstruct_p(cols, nparity,
                                    acols, c);
                        } else {
                                //ASSERT(nparity > 1);
                                vdev_raidz_reconstruct_q(cols, nparity,
                                    acols, c);
                        }

                        if (zio_checksum_error(bp, buf) == 0)
                                return (0);
                        break;

                case 2:
                        /*
                         * Two data column errors require double parity.
                         */
                        //ASSERT(nparity == 2);

                        /*
                         * Find the two columns that reported errors.
                         */
                        for (c = nparity; c < acols; c++) {
                                rc = &cols[c];
                                if (rc->rc_error != 0)
                                        break;
                        }
                        //ASSERT(c != acols);
                        //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

                        for (c1 = c++; c < acols; c++) {
                                rc = &cols[c];
                                if (rc->rc_error != 0)
                                        break;
                        }
                        //ASSERT(c != acols);
                        //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

                        if (temp_p == NULL)
                                temp_p = zfs_alloc_temp(max_rc_size);
                        if (temp_q == NULL)
                                temp_q = zfs_alloc_temp(max_rc_size);

                        vdev_raidz_reconstruct_pq(cols, nparity, acols,
                            c1, c, temp_p, temp_q);

                        if (zio_checksum_error(bp, buf) == 0)
                                return (0);
                        break;

                default:
                        break;
                        //ASSERT(nparity <= 2);
                        //ASSERT(0);
                }
        }

        /*
         * This isn't a typical situation -- either we got a read
         * error or a child silently returned bad data. Read every
         * block so we can try again with as much data and parity as
         * we can track down. If we've already been through once
         * before, all children will be marked as tried so we'll
         * proceed to combinatorial reconstruction.
         */
        n = 0;
        for (c = 0; c < acols; c++) {
                rc = &cols[c];
                if (rc->rc_tried)
                        continue;

                devidx = rc->rc_devidx;
                STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
                        if (kid->v_id == devidx)
                                break;
                if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
                        rc->rc_error = ENXIO;
                        rc->rc_tried = 1;       /* don't even try */
                        rc->rc_skipped = 1;
                        continue;
                }
                if (rc->rc_data)
                        rc->rc_error = kid->v_read(kid, NULL,
                            rc->rc_data, rc->rc_offset, rc->rc_size);
                else
                        rc->rc_error = ENXIO;
                if (rc->rc_error == 0)
                        n++;
                rc->rc_tried = 1;
                rc->rc_skipped = 0;
        }

        /*
         * If we managed to read anything more, retry the
         * reconstruction.
         */
        if (n)
                goto reconstruct;

        /*
         * At this point we've attempted to reconstruct the data given the
         * errors we detected, and we've attempted to read all columns. There
         * must, therefore, be one or more additional problems -- silent errors
         * resulting in invalid data rather than explicit I/O errors resulting
         * in absent data. Before we attempt combinatorial reconstruction make
         * sure we have a chance of coming up with the right answer.
         */
        if (total_errors >= nparity) {
                return (EIO);
        }

        if (cols[VDEV_RAIDZ_P].rc_error == 0) {
                /*
                 * Attempt to reconstruct the data from parity P.
                 */
                if (orig == NULL)
                        orig = zfs_alloc_temp(max_rc_size);
                for (c = nparity; c < acols; c++) {
                        rc = &cols[c];

                        memcpy(orig, rc->rc_data, rc->rc_size);
                        vdev_raidz_reconstruct_p(cols, nparity, acols, c);

                        if (zio_checksum_error(bp, buf) == 0)
                                return (0);

                        memcpy(rc->rc_data, orig, rc->rc_size);
                }
        }

        if (nparity > 1 && cols[VDEV_RAIDZ_Q].rc_error == 0) {
                /*
                 * Attempt to reconstruct the data from parity Q.
                 */
                if (orig == NULL)
                        orig = zfs_alloc_temp(max_rc_size);
                for (c = nparity; c < acols; c++) {
                        rc = &cols[c];

                        memcpy(orig, rc->rc_data, rc->rc_size);
                        vdev_raidz_reconstruct_q(cols, nparity, acols, c);

                        if (zio_checksum_error(bp, buf) == 0)
                                return (0);

                        memcpy(rc->rc_data, orig, rc->rc_size);
                }
        }

        if (nparity > 1 &&
            cols[VDEV_RAIDZ_P].rc_error == 0 &&
            cols[VDEV_RAIDZ_Q].rc_error == 0) {
                /*
                 * Attempt to reconstruct the data from both P and Q.
                 */
                if (orig == NULL)
                        orig = zfs_alloc_temp(max_rc_size);
                if (orig1 == NULL)
                        orig1 = zfs_alloc_temp(max_rc_size);
                if (temp_p == NULL)
                        temp_p = zfs_alloc_temp(max_rc_size);
                if (temp_q == NULL)
                        temp_q = zfs_alloc_temp(max_rc_size);
                for (c = nparity; c < acols - 1; c++) {
                        rc = &cols[c];

                        memcpy(orig, rc->rc_data, rc->rc_size);

                        for (c1 = c + 1; c1 < acols; c1++) {
                                rc1 = &cols[c1];

                                memcpy(orig1, rc1->rc_data, rc1->rc_size);

                                vdev_raidz_reconstruct_pq(cols, nparity,
                                    acols, c, c1, temp_p, temp_q);

                                if (zio_checksum_error(bp, buf) == 0)
                                        return (0);

                                memcpy(rc1->rc_data, orig1, rc1->rc_size);
                        }

                        memcpy(rc->rc_data, orig, rc->rc_size);
                }
        }

        return (EIO);
}