]> CyberLeo.Net >> Repos - FreeBSD/releng/10.0.git/blob - sys/geom/raid/md_promise.c
- Copy stable/10 (r259064) to releng/10.0 as part of the
[FreeBSD/releng/10.0.git] / sys / geom / raid / md_promise.c
1 /*-
2  * Copyright (c) 2011 Alexander Motin <mav@FreeBSD.org>
3  * Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30
31 #include <sys/param.h>
32 #include <sys/bio.h>
33 #include <sys/endian.h>
34 #include <sys/kernel.h>
35 #include <sys/kobj.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/systm.h>
41 #include <geom/geom.h>
42 #include "geom/raid/g_raid.h"
43 #include "g_raid_md_if.h"
44
/* Malloc type used for every allocation made by this file. */
static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata");

/* Number of entries in the disks[] table of struct promise_raid_conf. */
#define PROMISE_MAX_DISKS       8
/* Number of metadata records (slots) handled per physical disk. */
#define PROMISE_MAX_SUBDISKS    2
/* Distance, in sectors, between consecutive metadata records on a disk. */
#define PROMISE_META_OFFSET     14
50
/*
 * Subdisk descriptor as stored inside a Promise metadata record.
 * struct promise_raid_conf embeds it once as `disk' (the record's own
 * subdisk) and PROMISE_MAX_DISKS times as `disks[]'.  The structure is
 * packed, so the fields below are laid out without padding.
 */
struct promise_raid_disk {
        uint8_t         flags;                  /* Subdisk status. */
/* Bit values for the flags field. */
#define PROMISE_F_VALID         0x01
#define PROMISE_F_ONLINE        0x02
#define PROMISE_F_ASSIGNED      0x04
#define PROMISE_F_SPARE         0x08
#define PROMISE_F_DUPLICATE     0x10
#define PROMISE_F_REDIR         0x20
#define PROMISE_F_DOWN          0x40
#define PROMISE_F_READY         0x80

        uint8_t         number;                 /* Position in a volume. */
        uint8_t         channel;                /* ATA channel number. */
        uint8_t         device;                 /* ATA device number. */
        uint64_t        id __packed;            /* Subdisk ID. */
} __packed;
67
68 struct promise_raid_conf {
69         char            promise_id[24];
70 #define PROMISE_MAGIC           "Promise Technology, Inc."
71 #define FREEBSD_MAGIC           "FreeBSD ATA driver RAID "
72
73         uint32_t        dummy_0;
74         uint64_t        magic_0;
75 #define PROMISE_MAGIC0(x)       (((uint64_t)(x.channel) << 48) | \
76                                 ((uint64_t)(x.device != 0) << 56))
77         uint16_t        magic_1;
78         uint32_t        magic_2;
79         uint8_t         filler1[470];
80
81         uint32_t        integrity;
82 #define PROMISE_I_VALID         0x00000080
83
84         struct promise_raid_disk        disk;   /* This subdisk info. */
85         uint32_t        disk_offset;            /* Subdisk offset. */
86         uint32_t        disk_sectors;           /* Subdisk size */
87         uint32_t        disk_rebuild;           /* Rebuild position. */
88         uint16_t        generation;             /* Generation number. */
89         uint8_t         status;                 /* Volume status. */
90 #define PROMISE_S_VALID         0x01
91 #define PROMISE_S_ONLINE        0x02
92 #define PROMISE_S_INITED        0x04
93 #define PROMISE_S_READY         0x08
94 #define PROMISE_S_DEGRADED      0x10
95 #define PROMISE_S_MARKED        0x20
96 #define PROMISE_S_MIGRATING     0x40
97 #define PROMISE_S_FUNCTIONAL    0x80
98
99         uint8_t         type;                   /* Voluem type. */
100 #define PROMISE_T_RAID0         0x00
101 #define PROMISE_T_RAID1         0x01
102 #define PROMISE_T_RAID3         0x02
103 #define PROMISE_T_RAID5         0x04
104 #define PROMISE_T_SPAN          0x08
105 #define PROMISE_T_JBOD          0x10
106
107         uint8_t         total_disks;            /* Disks in this volume. */
108         uint8_t         stripe_shift;           /* Strip size. */
109         uint8_t         array_width;            /* Number of RAID0 stripes. */
110         uint8_t         array_number;           /* Global volume number. */
111         uint32_t        total_sectors;          /* Volume size. */
112         uint16_t        cylinders;              /* Volume geometry: C. */
113         uint8_t         heads;                  /* Volume geometry: H. */
114         uint8_t         sectors;                /* Volume geometry: S. */
115         uint64_t        volume_id __packed;     /* Volume ID, */
116         struct promise_raid_disk        disks[PROMISE_MAX_DISKS];
117                                                 /* Subdisks in this volume. */
118         char            name[32];               /* Volume label. */
119
120         uint32_t        filler2[8];
121         uint32_t        magic_3;        /* Something related to rebuild. */
122         uint64_t        rebuild_lba64;  /* Per-volume rebuild position. */
123         uint32_t        magic_4;
124         uint32_t        magic_5;
125         uint32_t        total_sectors_high;
126         uint8_t         magic_6;
127         uint8_t         sector_size;
128         uint16_t        magic_7;
129         uint32_t        magic_8[31];
130         uint32_t        backup_time;
131         uint16_t        magic_9;
132         uint32_t        disk_offset_high;
133         uint32_t        disk_sectors_high;
134         uint32_t        disk_rebuild_high;
135         uint16_t        magic_10;
136         uint32_t        magic_11[3];
137         uint32_t        filler3[284];
138         uint32_t        checksum;
139 } __packed;
140
/*
 * Per-disk private data of this metadata module (stored in d_md_data of
 * a struct g_raid_disk).
 */
struct g_raid_md_promise_perdisk {
        int              pd_updated;    /* Set to 1 when a pd_meta[] entry is dropped. */
        int              pd_subdisks;   /* Number of used pd_meta[] entries. */
        /* Metadata records of this disk; used entries are packed at the front. */
        struct promise_raid_conf        *pd_meta[PROMISE_MAX_SUBDISKS];
};
146
/*
 * Per-volume private data of this metadata module (stored in v_md_data of
 * a struct g_raid_volume).
 */
struct g_raid_md_promise_pervolume {
        struct promise_raid_conf        *pv_meta;       /* Metadata record describing the volume. */
        uint64_t                         pv_id;         /* Matched against volume_id of metadata records. */
        uint16_t                         pv_generation; /* NOTE(review): presumably mirrors `generation' of the metadata - confirm. */
        int                              pv_disks_present; /* NOTE(review): looks like a count of disks seen so far - confirm. */
        int                              pv_started;    /* Non-zero once the volume has been started. */
        struct callout                   pv_start_co;   /* STARTING state timer. */
};
155
/*
 * Forward declarations of this module's implementations of the g_raid_md
 * interface methods; they are referenced by the method table below.
 */
static g_raid_md_create_t g_raid_md_create_promise;
static g_raid_md_taste_t g_raid_md_taste_promise;
static g_raid_md_event_t g_raid_md_event_promise;
static g_raid_md_volume_event_t g_raid_md_volume_event_promise;
static g_raid_md_ctl_t g_raid_md_ctl_promise;
static g_raid_md_write_t g_raid_md_write_promise;
static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise;
static g_raid_md_free_disk_t g_raid_md_free_disk_promise;
static g_raid_md_free_volume_t g_raid_md_free_volume_promise;
static g_raid_md_free_t g_raid_md_free_promise;
166
/*
 * kobj method table binding each g_raid_md interface method to the
 * Promise-specific implementation.  The all-zero entry ends the table.
 */
static kobj_method_t g_raid_md_promise_methods[] = {
        KOBJMETHOD(g_raid_md_create,    g_raid_md_create_promise),
        KOBJMETHOD(g_raid_md_taste,     g_raid_md_taste_promise),
        KOBJMETHOD(g_raid_md_event,     g_raid_md_event_promise),
        KOBJMETHOD(g_raid_md_volume_event,      g_raid_md_volume_event_promise),
        KOBJMETHOD(g_raid_md_ctl,       g_raid_md_ctl_promise),
        KOBJMETHOD(g_raid_md_write,     g_raid_md_write_promise),
        KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_promise),
        KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_promise),
        KOBJMETHOD(g_raid_md_free_volume,       g_raid_md_free_volume_promise),
        KOBJMETHOD(g_raid_md_free,      g_raid_md_free_promise),
        { 0, 0 }
};
180
/*
 * Class descriptor of the "Promise" metadata module: its name, the method
 * table above and the size of the object to instantiate.
 * NOTE(review): the exact meaning of mdc_enable and mdc_priority is defined
 * by g_raid.h, which is not visible here - confirm before changing them.
 */
static struct g_raid_md_class g_raid_md_promise_class = {
        "Promise",
        g_raid_md_promise_methods,
        sizeof(struct g_raid_md_object),
        .mdc_enable = 1,
        .mdc_priority = 100
};
188
189
190 static void
191 g_raid_md_promise_print(struct promise_raid_conf *meta)
192 {
193         int i;
194
195         if (g_raid_debug < 1)
196                 return;
197
198         printf("********* ATA Promise Metadata *********\n");
199         printf("promise_id          <%.24s>\n", meta->promise_id);
200         printf("disk                %02x %02x %02x %02x %016jx\n",
201             meta->disk.flags, meta->disk.number, meta->disk.channel,
202             meta->disk.device, meta->disk.id);
203         printf("disk_offset         %u\n", meta->disk_offset);
204         printf("disk_sectors        %u\n", meta->disk_sectors);
205         printf("disk_rebuild        %u\n", meta->disk_rebuild);
206         printf("generation          %u\n", meta->generation);
207         printf("status              0x%02x\n", meta->status);
208         printf("type                %u\n", meta->type);
209         printf("total_disks         %u\n", meta->total_disks);
210         printf("stripe_shift        %u\n", meta->stripe_shift);
211         printf("array_width         %u\n", meta->array_width);
212         printf("array_number        %u\n", meta->array_number);
213         printf("total_sectors       %u\n", meta->total_sectors);
214         printf("cylinders           %u\n", meta->cylinders);
215         printf("heads               %u\n", meta->heads);
216         printf("sectors             %u\n", meta->sectors);
217         printf("volume_id           0x%016jx\n", meta->volume_id);
218         printf("disks:\n");
219         for (i = 0; i < PROMISE_MAX_DISKS; i++ ) {
220                 printf("                    %02x %02x %02x %02x %016jx\n",
221                     meta->disks[i].flags, meta->disks[i].number,
222                     meta->disks[i].channel, meta->disks[i].device,
223                     meta->disks[i].id);
224         }
225         printf("name                <%.32s>\n", meta->name);
226         printf("magic_3             0x%08x\n", meta->magic_3);
227         printf("rebuild_lba64       %ju\n", meta->rebuild_lba64);
228         printf("magic_4             0x%08x\n", meta->magic_4);
229         printf("magic_5             0x%08x\n", meta->magic_5);
230         printf("total_sectors_high  0x%08x\n", meta->total_sectors_high);
231         printf("sector_size         %u\n", meta->sector_size);
232         printf("backup_time         %d\n", meta->backup_time);
233         printf("disk_offset_high    0x%08x\n", meta->disk_offset_high);
234         printf("disk_sectors_high   0x%08x\n", meta->disk_sectors_high);
235         printf("disk_rebuild_high   0x%08x\n", meta->disk_rebuild_high);
236         printf("=================================================\n");
237 }
238
239 static struct promise_raid_conf *
240 promise_meta_copy(struct promise_raid_conf *meta)
241 {
242         struct promise_raid_conf *nmeta;
243
244         nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK);
245         memcpy(nmeta, meta, sizeof(*nmeta));
246         return (nmeta);
247 }
248
249 static int
250 promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id)
251 {
252         int pos;
253
254         for (pos = 0; pos < meta->total_disks; pos++) {
255                 if (meta->disks[pos].id == id)
256                         return (pos);
257         }
258         return (-1);
259 }
260
261 static int
262 promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd,
263     off_t sectors, off_t *off, off_t *size)
264 {
265         off_t coff, csize, tmp;
266         int i, j;
267
268         sectors -= 131072;
269         *off = 0;
270         *size = 0;
271         coff = 0;
272         csize = sectors;
273         i = 0;
274         while (1) {
275                 for (j = 0; j < nsd; j++) {
276                         tmp = ((off_t)metaarr[j]->disk_offset_high << 32) +
277                             metaarr[j]->disk_offset;
278                         if (tmp >= coff)
279                                 csize = MIN(csize, tmp - coff);
280                 }
281                 if (csize > *size) {
282                         *off = coff;
283                         *size = csize;
284                 }
285                 if (i >= nsd)
286                         break;
287                 coff = ((off_t)metaarr[i]->disk_offset_high << 32) +
288                      metaarr[i]->disk_offset +
289                     ((off_t)metaarr[i]->disk_sectors_high << 32) +
290                      metaarr[i]->disk_sectors;
291                 csize = sectors - coff;
292                 i++;
293         };
294         return ((*size > 0) ? 1 : 0);
295 }
296
297 static int
298 promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos)
299 {
300         int disk_pos, width;
301
302         if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
303                 width = vol->v_disks_count / 2;
304                 disk_pos = (md_disk_pos / width) +
305                     (md_disk_pos % width) * width;
306         } else
307                 disk_pos = md_disk_pos;
308         return (disk_pos);
309 }
310
311 static void
312 promise_meta_get_name(struct promise_raid_conf *meta, char *buf)
313 {
314         int i;
315
316         strncpy(buf, meta->name, 32);
317         buf[32] = 0;
318         for (i = 31; i >= 0; i--) {
319                 if (buf[i] > 0x20)
320                         break;
321                 buf[i] = 0;
322         }
323 }
324
325 static void
326 promise_meta_put_name(struct promise_raid_conf *meta, char *buf)
327 {
328
329         memset(meta->name, 0x20, 32);
330         memcpy(meta->name, buf, MIN(strlen(buf), 32));
331 }
332
333 static int
334 promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr)
335 {
336         struct g_provider *pp;
337         struct promise_raid_conf *meta;
338         char *buf;
339         int error, i, subdisks;
340         uint32_t checksum, *ptr;
341
342         pp = cp->provider;
343         subdisks = 0;
344 next:
345         /* Read metadata block. */
346         buf = g_read_data(cp, pp->mediasize - pp->sectorsize *
347             (63 - subdisks * PROMISE_META_OFFSET),
348             pp->sectorsize * 4, &error);
349         if (buf == NULL) {
350                 G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
351                     pp->name, error);
352                 return (subdisks);
353         }
354         meta = (struct promise_raid_conf *)buf;
355
356         /* Check if this is an Promise RAID struct */
357         if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) &&
358             strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) {
359                 if (subdisks == 0)
360                         G_RAID_DEBUG(1,
361                             "Promise signature check failed on %s", pp->name);
362                 g_free(buf);
363                 return (subdisks);
364         }
365         meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK);
366         memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4));
367         g_free(buf);
368
369         /* Check metadata checksum. */
370         for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
371                 checksum += *ptr++;
372         if (checksum != meta->checksum) {
373                 G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name);
374                 free(meta, M_MD_PROMISE);
375                 return (subdisks);
376         }
377
378         if ((meta->integrity & PROMISE_I_VALID) == 0) {
379                 G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name);
380                 free(meta, M_MD_PROMISE);
381                 return (subdisks);
382         }
383
384         if (meta->total_disks > PROMISE_MAX_DISKS) {
385                 G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)",
386                     pp->name, meta->total_disks);
387                 free(meta, M_MD_PROMISE);
388                 return (subdisks);
389         }
390
391         /* Remove filler garbage from fields used in newer metadata. */
392         if (meta->disk_offset_high == 0x8b8c8d8e &&
393             meta->disk_sectors_high == 0x8788898a &&
394             meta->disk_rebuild_high == 0x83848586) {
395                 meta->disk_offset_high = 0;
396                 meta->disk_sectors_high = 0;
397                 if (meta->disk_rebuild == UINT32_MAX)
398                         meta->disk_rebuild_high = UINT32_MAX;
399                 else
400                         meta->disk_rebuild_high = 0;
401                 if (meta->total_sectors_high == 0x15161718) {
402                         meta->total_sectors_high = 0;
403                         meta->backup_time = 0;
404                         if (meta->rebuild_lba64 == 0x2122232425262728)
405                                 meta->rebuild_lba64 = UINT64_MAX;
406                 }
407         }
408         if (meta->sector_size < 1 || meta->sector_size > 8)
409                 meta->sector_size = 1;
410
411         /* Save this part and look for next. */
412         *metaarr = meta;
413         metaarr++;
414         subdisks++;
415         if (subdisks < PROMISE_MAX_SUBDISKS)
416                 goto next;
417
418         return (subdisks);
419 }
420
421 static int
422 promise_meta_write(struct g_consumer *cp,
423     struct promise_raid_conf **metaarr, int nsd)
424 {
425         struct g_provider *pp;
426         struct promise_raid_conf *meta;
427         char *buf;
428         off_t off, size;
429         int error, i, subdisk, fake;
430         uint32_t checksum, *ptr;
431
432         pp = cp->provider;
433         subdisk = 0;
434         fake = 0;
435 next:
436         buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO);
437         meta = NULL;
438         if (subdisk < nsd) {
439                 meta = metaarr[subdisk];
440         } else if (!fake && promise_meta_unused_range(metaarr, nsd,
441             cp->provider->mediasize / cp->provider->sectorsize,
442             &off, &size)) {
443                 /* Optionally add record for unused space. */
444                 meta = (struct promise_raid_conf *)buf;
445                 memcpy(&meta->promise_id[0], PROMISE_MAGIC,
446                     sizeof(PROMISE_MAGIC) - 1);
447                 meta->dummy_0 = 0x00020000;
448                 meta->integrity = PROMISE_I_VALID;
449                 meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID;
450                 meta->disk.number = 0xff;
451                 arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
452                 meta->disk_offset_high = off >> 32;
453                 meta->disk_offset = (uint32_t)off;
454                 meta->disk_sectors_high = size >> 32;
455                 meta->disk_sectors = (uint32_t)size;
456                 meta->disk_rebuild_high = UINT32_MAX;
457                 meta->disk_rebuild = UINT32_MAX;
458                 fake = 1;
459         }
460         if (meta != NULL) {
461                 /* Recalculate checksum for case if metadata were changed. */
462                 meta->checksum = 0;
463                 for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
464                         checksum += *ptr++;
465                 meta->checksum = checksum;
466                 memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta)));
467         }
468         error = g_write_data(cp, pp->mediasize - pp->sectorsize *
469             (63 - subdisk * PROMISE_META_OFFSET),
470             buf, pp->sectorsize * 4);
471         if (error != 0) {
472                 G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
473                     pp->name, error);
474         }
475         free(buf, M_MD_PROMISE);
476
477         subdisk++;
478         if (subdisk < PROMISE_MAX_SUBDISKS)
479                 goto next;
480
481         return (error);
482 }
483
484 static int
485 promise_meta_erase(struct g_consumer *cp)
486 {
487         struct g_provider *pp;
488         char *buf;
489         int error, subdisk;
490
491         pp = cp->provider;
492         buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO);
493         for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) {
494                 error = g_write_data(cp, pp->mediasize - pp->sectorsize *
495                     (63 - subdisk * PROMISE_META_OFFSET),
496                     buf, 4 * pp->sectorsize);
497                 if (error != 0) {
498                         G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
499                             pp->name, error);
500                 }
501         }
502         free(buf, M_MD_PROMISE);
503         return (error);
504 }
505
506 static int
507 promise_meta_write_spare(struct g_consumer *cp)
508 {
509         struct promise_raid_conf *meta;
510         off_t tmp;
511         int error;
512
513         meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
514         memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1);
515         meta->dummy_0 = 0x00020000;
516         meta->integrity = PROMISE_I_VALID;
517         meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID;
518         meta->disk.number = 0xff;
519         arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
520         tmp = cp->provider->mediasize / cp->provider->sectorsize - 131072;
521         meta->disk_sectors_high = tmp >> 32;
522         meta->disk_sectors = (uint32_t)tmp;
523         meta->disk_rebuild_high = UINT32_MAX;
524         meta->disk_rebuild = UINT32_MAX;
525         error = promise_meta_write(cp, &meta, 1);
526         free(meta, M_MD_PROMISE);
527         return (error);
528 }
529
530 static struct g_raid_volume *
531 g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id)
532 {
533         struct g_raid_volume    *vol;
534         struct g_raid_md_promise_pervolume *pv;
535
536         TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
537                 pv = vol->v_md_data;
538                 if (pv->pv_id == id)
539                         break;
540         }
541         return (vol);
542 }
543
544 static int
545 g_raid_md_promise_purge_volumes(struct g_raid_softc *sc)
546 {
547         struct g_raid_volume    *vol, *tvol;
548         struct g_raid_md_promise_pervolume *pv;
549         int i, res;
550
551         res = 0;
552         TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) {
553                 pv = vol->v_md_data;
554                 if (!pv->pv_started || vol->v_stopping)
555                         continue;
556                 for (i = 0; i < vol->v_disks_count; i++) {
557                         if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE)
558                                 break;
559                 }
560                 if (i >= vol->v_disks_count) {
561                         g_raid_destroy_volume(vol);
562                         res = 1;
563                 }
564         }
565         return (res);
566 }
567
568 static int
569 g_raid_md_promise_purge_disks(struct g_raid_softc *sc)
570 {
571         struct g_raid_disk      *disk, *tdisk;
572         struct g_raid_volume    *vol;
573         struct g_raid_md_promise_perdisk *pd;
574         int i, j, res;
575
576         res = 0;
577         TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
578                 if (disk->d_state == G_RAID_DISK_S_SPARE)
579                         continue;
580                 pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
581
582                 /* Scan for deleted volumes. */
583                 for (i = 0; i < pd->pd_subdisks; ) {
584                         vol = g_raid_md_promise_get_volume(sc,
585                             pd->pd_meta[i]->volume_id);
586                         if (vol != NULL && !vol->v_stopping) {
587                                 i++;
588                                 continue;
589                         }
590                         free(pd->pd_meta[i], M_MD_PROMISE);
591                         for (j = i; j < pd->pd_subdisks - 1; j++)
592                                 pd->pd_meta[j] = pd->pd_meta[j + 1];
593                         pd->pd_meta[pd->pd_subdisks - 1] = NULL;
594                         pd->pd_subdisks--;
595                         pd->pd_updated = 1;
596                 }
597
598                 /* If there is no metadata left - erase and delete disk. */
599                 if (pd->pd_subdisks == 0) {
600                         promise_meta_erase(disk->d_consumer);
601                         g_raid_destroy_disk(disk);
602                         res = 1;
603                 }
604         }
605         return (res);
606 }
607
608 static int
609 g_raid_md_promise_supported(int level, int qual, int disks, int force)
610 {
611
612         if (disks > PROMISE_MAX_DISKS)
613                 return (0);
614         switch (level) {
615         case G_RAID_VOLUME_RL_RAID0:
616                 if (disks < 1)
617                         return (0);
618                 if (!force && disks < 2)
619                         return (0);
620                 break;
621         case G_RAID_VOLUME_RL_RAID1:
622                 if (disks < 1)
623                         return (0);
624                 if (!force && (disks != 2))
625                         return (0);
626                 break;
627         case G_RAID_VOLUME_RL_RAID1E:
628                 if (disks < 2)
629                         return (0);
630                 if (disks % 2 != 0)
631                         return (0);
632                 if (!force && (disks != 4))
633                         return (0);
634                 break;
635         case G_RAID_VOLUME_RL_SINGLE:
636                 if (disks != 1)
637                         return (0);
638                 break;
639         case G_RAID_VOLUME_RL_CONCAT:
640                 if (disks < 2)
641                         return (0);
642                 break;
643         case G_RAID_VOLUME_RL_RAID5:
644                 if (disks < 3)
645                         return (0);
646                 if (qual != G_RAID_VOLUME_RLQ_R5LA)
647                         return (0);
648                 break;
649         default:
650                 return (0);
651         }
652         if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE)
653                 return (0);
654         return (1);
655 }
656
657 static int
658 g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn,
659     struct g_raid_volume *vol)
660 {
661         struct g_raid_softc *sc;
662         struct g_raid_subdisk *sd;
663         struct g_raid_md_promise_perdisk *pd;
664         struct g_raid_md_promise_pervolume *pv;
665         struct promise_raid_conf *meta;
666         off_t eoff, esize, size;
667         int disk_pos, md_disk_pos, i, resurrection = 0;
668
669         sc = disk->d_softc;
670         pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
671
672         pv = vol->v_md_data;
673         meta = pv->pv_meta;
674
675         if (sdn >= 0) {
676                 /* Find disk position in metadata by it's serial. */
677                 md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id);
678                 /* For RAID0+1 we need to translate order. */
679                 disk_pos = promise_meta_translate_disk(vol, md_disk_pos);
680         } else {
681                 md_disk_pos = -1;
682                 disk_pos = -1;
683         }
684         if (disk_pos < 0) {
685                 G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s",
686                     g_raid_get_diskname(disk), vol->v_name);
687                 /* Failed stale disk is useless for us. */
688                 if (sdn >= 0 &&
689                     pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) {
690                         g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
691                         return (0);
692                 }
693                 /* If we were given specific metadata subdisk - erase it. */
694                 if (sdn >= 0) {
695                         free(pd->pd_meta[sdn], M_MD_PROMISE);
696                         for (i = sdn; i < pd->pd_subdisks - 1; i++)
697                                 pd->pd_meta[i] = pd->pd_meta[i + 1];
698                         pd->pd_meta[pd->pd_subdisks - 1] = NULL;
699                         pd->pd_subdisks--;
700                 }
701                 /* If we are in the start process, that's all for now. */
702                 if (!pv->pv_started)
703                         goto nofit;
704                 /*
705                  * If we have already started - try to get use of the disk.
706                  * Try to replace OFFLINE disks first, then FAILED.
707                  */
708                 promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks,
709                     disk->d_consumer->provider->mediasize /
710                     disk->d_consumer->provider->sectorsize,
711                     &eoff, &esize);
712                 if (esize == 0) {
713                         G_RAID_DEBUG1(1, sc, "No free space on disk %s",
714                             g_raid_get_diskname(disk));
715                         goto nofit;
716                 }
717                 size = INT64_MAX;
718                 for (i = 0; i < vol->v_disks_count; i++) {
719                         sd = &vol->v_subdisks[i];
720                         if (sd->sd_state != G_RAID_SUBDISK_S_NONE)
721                                 size = sd->sd_size;
722                         if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED &&
723                             (disk_pos < 0 ||
724                              vol->v_subdisks[i].sd_state < sd->sd_state))
725                                 disk_pos = i;
726                 }
727                 if (disk_pos >= 0 &&
728                     vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT &&
729                     (off_t)esize * 512 < size) {
730                         G_RAID_DEBUG1(1, sc, "Disk %s free space "
731                             "is too small (%ju < %ju)",
732                             g_raid_get_diskname(disk),
733                             (off_t)esize * 512, size);
734                         disk_pos = -1;
735                 }
736                 if (disk_pos >= 0) {
737                         if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT)
738                                 esize = size / 512;
739                         /* For RAID0+1 we need to translate order. */
740                         md_disk_pos = promise_meta_translate_disk(vol, disk_pos);
741                 } else {
742 nofit:
743                         if (pd->pd_subdisks == 0) {
744                                 g_raid_change_disk_state(disk,
745                                     G_RAID_DISK_S_SPARE);
746                         }
747                         return (0);
748                 }
749                 G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s",
750                     g_raid_get_diskname(disk), disk_pos, vol->v_name);
751                 resurrection = 1;
752         }
753
754         sd = &vol->v_subdisks[disk_pos];
755
756         if (resurrection && sd->sd_disk != NULL) {
757                 g_raid_change_disk_state(sd->sd_disk,
758                     G_RAID_DISK_S_STALE_FAILED);
759                 TAILQ_REMOVE(&sd->sd_disk->d_subdisks,
760                     sd, sd_next);
761         }
762         vol->v_subdisks[disk_pos].sd_disk = disk;
763         TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
764
765         /* Welcome the new disk. */
766         if (resurrection)
767                 g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
768         else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN)
769                 g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
770         else
771                 g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
772
773         if (resurrection) {
774                 sd->sd_offset = (off_t)eoff * 512;
775                 sd->sd_size = (off_t)esize * 512;
776         } else {
777                 sd->sd_offset = (((off_t)pd->pd_meta[sdn]->disk_offset_high
778                     << 32) + pd->pd_meta[sdn]->disk_offset) * 512;
779                 sd->sd_size = (((off_t)pd->pd_meta[sdn]->disk_sectors_high
780                     << 32) + pd->pd_meta[sdn]->disk_sectors) * 512;
781         }
782
783         if (resurrection) {
784                 /* Stale disk, almost same as new. */
785                 g_raid_change_subdisk_state(sd,
786                     G_RAID_SUBDISK_S_NEW);
787         } else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) {
788                 /* Failed disk. */
789                 g_raid_change_subdisk_state(sd,
790                     G_RAID_SUBDISK_S_FAILED);
791         } else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) {
792                 /* Rebuilding disk. */
793                 g_raid_change_subdisk_state(sd,
794                     G_RAID_SUBDISK_S_REBUILD);
795                 if (pd->pd_meta[sdn]->generation != meta->generation)
796                         sd->sd_rebuild_pos = 0;
797                 else {
798                         sd->sd_rebuild_pos =
799                             (((off_t)pd->pd_meta[sdn]->disk_rebuild_high << 32) +
800                              pd->pd_meta[sdn]->disk_rebuild) * 512;
801                 }
802         } else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) {
803                 /* Rebuilding disk. */
804                 g_raid_change_subdisk_state(sd,
805                     G_RAID_SUBDISK_S_NEW);
806         } else if (pd->pd_meta[sdn]->generation != meta->generation ||
807             (meta->status & PROMISE_S_MARKED)) {
808                 /* Stale disk or dirty volume (unclean shutdown). */
809                 g_raid_change_subdisk_state(sd,
810                     G_RAID_SUBDISK_S_STALE);
811         } else {
812                 /* Up to date disk. */
813                 g_raid_change_subdisk_state(sd,
814                     G_RAID_SUBDISK_S_ACTIVE);
815         }
816         g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
817             G_RAID_EVENT_SUBDISK);
818
819         return (resurrection);
820 }
821
822 static void
823 g_raid_md_promise_refill(struct g_raid_softc *sc)
824 {
825         struct g_raid_volume *vol;
826         struct g_raid_subdisk *sd;
827         struct g_raid_disk *disk;
828         struct g_raid_md_object *md;
829         struct g_raid_md_promise_perdisk *pd;
830         struct g_raid_md_promise_pervolume *pv;
831         int update, updated, i, bad;
832
833         md = sc->sc_md;
834 restart:
835         updated = 0;
836         TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
837                 pv = vol->v_md_data;
838                 if (!pv->pv_started || vol->v_stopping)
839                         continue;
840
841                 /* Search for subdisk that needs replacement. */
842                 bad = 0;
843                 for (i = 0; i < vol->v_disks_count; i++) {
844                         sd = &vol->v_subdisks[i];
845                         if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
846                             sd->sd_state == G_RAID_SUBDISK_S_FAILED)
847                                 bad = 1;
848                 }
849                 if (!bad)
850                         continue;
851
852                 G_RAID_DEBUG1(1, sc, "Volume %s is not complete, "
853                     "trying to refill.", vol->v_name);
854
855                 TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
856                         /* Skip failed. */
857                         if (disk->d_state < G_RAID_DISK_S_SPARE)
858                                 continue;
859                         /* Skip already used by this volume. */
860                         for (i = 0; i < vol->v_disks_count; i++) {
861                                 sd = &vol->v_subdisks[i];
862                                 if (sd->sd_disk == disk)
863                                         break;
864                         }
865                         if (i < vol->v_disks_count)
866                                 continue;
867
868                         /* Try to use disk if it has empty extents. */
869                         pd = disk->d_md_data;
870                         if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) {
871                                 update =
872                                     g_raid_md_promise_start_disk(disk, -1, vol);
873                         } else
874                                 update = 0;
875                         if (update) {
876                                 updated = 1;
877                                 g_raid_md_write_promise(md, vol, NULL, disk);
878                                 break;
879                         }
880                 }
881         }
882         if (updated)
883                 goto restart;
884 }
885
886 static void
887 g_raid_md_promise_start(struct g_raid_volume *vol)
888 {
889         struct g_raid_softc *sc;
890         struct g_raid_subdisk *sd;
891         struct g_raid_disk *disk;
892         struct g_raid_md_object *md;
893         struct g_raid_md_promise_perdisk *pd;
894         struct g_raid_md_promise_pervolume *pv;
895         struct promise_raid_conf *meta;
896         int i;
897
898         sc = vol->v_softc;
899         md = sc->sc_md;
900         pv = vol->v_md_data;
901         meta = pv->pv_meta;
902
903         vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
904         if (meta->type == PROMISE_T_RAID0)
905                 vol->v_raid_level = G_RAID_VOLUME_RL_RAID0;
906         else if (meta->type == PROMISE_T_RAID1) {
907                 if (meta->array_width == 1)
908                         vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
909                 else
910                         vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
911         } else if (meta->type == PROMISE_T_RAID3)
912                 vol->v_raid_level = G_RAID_VOLUME_RL_RAID3;
913         else if (meta->type == PROMISE_T_RAID5) {
914                 vol->v_raid_level = G_RAID_VOLUME_RL_RAID5;
915                 vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA;
916         } else if (meta->type == PROMISE_T_SPAN)
917                 vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT;
918         else if (meta->type == PROMISE_T_JBOD)
919                 vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE;
920         else
921                 vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
922         vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ
923         vol->v_disks_count = meta->total_disks;
924         vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ
925         if (meta->total_sectors_high < 256) /* If value looks sane. */
926                 vol->v_mediasize +=
927                     ((off_t)meta->total_sectors_high << 32) * 512; //ZZZ
928         vol->v_sectorsize = 512 * meta->sector_size;
929         for (i = 0; i < vol->v_disks_count; i++) {
930                 sd = &vol->v_subdisks[i];
931                 sd->sd_offset = (((off_t)meta->disk_offset_high << 32) +
932                     meta->disk_offset) * 512;
933                 sd->sd_size = (((off_t)meta->disk_sectors_high << 32) +
934                     meta->disk_sectors) * 512;
935         }
936         g_raid_start_volume(vol);
937
938         /* Make all disks found till the moment take their places. */
939         TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
940                 pd = disk->d_md_data;
941                 for (i = 0; i < pd->pd_subdisks; i++) {
942                         if (pd->pd_meta[i]->volume_id == meta->volume_id)
943                                 g_raid_md_promise_start_disk(disk, i, vol);
944                 }
945         }
946
947         pv->pv_started = 1;
948         callout_stop(&pv->pv_start_co);
949         G_RAID_DEBUG1(0, sc, "Volume started.");
950         g_raid_md_write_promise(md, vol, NULL, NULL);
951
952         /* Pickup any STALE/SPARE disks to refill array if needed. */
953         g_raid_md_promise_refill(sc);
954
955         g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME);
956 }
957
958 static void
959 g_raid_promise_go(void *arg)
960 {
961         struct g_raid_volume *vol;
962         struct g_raid_softc *sc;
963         struct g_raid_md_promise_pervolume *pv;
964
965         vol = arg;
966         pv = vol->v_md_data;
967         sc = vol->v_softc;
968         if (!pv->pv_started) {
969                 G_RAID_DEBUG1(0, sc, "Force volume start due to timeout.");
970                 g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD,
971                     G_RAID_EVENT_VOLUME);
972         }
973 }
974
975 static void
976 g_raid_md_promise_new_disk(struct g_raid_disk *disk)
977 {
978         struct g_raid_softc *sc;
979         struct g_raid_md_object *md;
980         struct promise_raid_conf *pdmeta;
981         struct g_raid_md_promise_perdisk *pd;
982         struct g_raid_md_promise_pervolume *pv;
983         struct g_raid_volume *vol;
984         int i;
985         char buf[33];
986
987         sc = disk->d_softc;
988         md = sc->sc_md;
989         pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
990
991         if (pd->pd_subdisks == 0) {
992                 g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
993                 g_raid_md_promise_refill(sc);
994                 return;
995         }
996
997         for (i = 0; i < pd->pd_subdisks; i++) {
998                 pdmeta = pd->pd_meta[i];
999
1000                 /* Look for volume with matching ID. */
1001                 vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
1002                 if (vol == NULL) {
1003                         promise_meta_get_name(pdmeta, buf);
1004                         vol = g_raid_create_volume(sc, buf, pdmeta->array_number);
1005                         pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
1006                         pv->pv_id = pdmeta->volume_id;
1007                         vol->v_md_data = pv;
1008                         callout_init(&pv->pv_start_co, 1);
1009                         callout_reset(&pv->pv_start_co,
1010                             g_raid_start_timeout * hz,
1011                             g_raid_promise_go, vol);
1012                 } else
1013                         pv = vol->v_md_data;
1014
1015                 /* If we haven't started yet - check metadata freshness. */
1016                 if (pv->pv_meta == NULL || !pv->pv_started) {
1017                         if (pv->pv_meta == NULL ||
1018                             ((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) {
1019                                 G_RAID_DEBUG1(1, sc, "Newer disk");
1020                                 if (pv->pv_meta != NULL)
1021                                         free(pv->pv_meta, M_MD_PROMISE);
1022                                 pv->pv_meta = promise_meta_copy(pdmeta);
1023                                 pv->pv_generation = pv->pv_meta->generation;
1024                                 pv->pv_disks_present = 1;
1025                         } else if (pdmeta->generation == pv->pv_generation) {
1026                                 pv->pv_disks_present++;
1027                                 G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)",
1028                                     pv->pv_disks_present,
1029                                     pv->pv_meta->total_disks);
1030                         } else {
1031                                 G_RAID_DEBUG1(1, sc, "Older disk");
1032                         }
1033                 }
1034         }
1035
1036         for (i = 0; i < pd->pd_subdisks; i++) {
1037                 pdmeta = pd->pd_meta[i];
1038
1039                 /* Look for volume with matching ID. */
1040                 vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
1041                 if (vol == NULL)
1042                         continue;
1043                 pv = vol->v_md_data;
1044
1045                 if (pv->pv_started) {
1046                         if (g_raid_md_promise_start_disk(disk, i, vol))
1047                                 g_raid_md_write_promise(md, vol, NULL, NULL);
1048                 } else {
1049                         /* If we collected all needed disks - start array. */
1050                         if (pv->pv_disks_present == pv->pv_meta->total_disks)
1051                                 g_raid_md_promise_start(vol);
1052                 }
1053         }
1054 }
1055
1056 static int
1057 g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp,
1058     struct g_geom **gp)
1059 {
1060         struct g_geom *geom;
1061         struct g_raid_softc *sc;
1062
1063         /* Search for existing node. */
1064         LIST_FOREACH(geom, &mp->geom, geom) {
1065                 sc = geom->softc;
1066                 if (sc == NULL)
1067                         continue;
1068                 if (sc->sc_stopping != 0)
1069                         continue;
1070                 if (sc->sc_md->mdo_class != md->mdo_class)
1071                         continue;
1072                 break;
1073         }
1074         if (geom != NULL) {
1075                 *gp = geom;
1076                 return (G_RAID_MD_TASTE_EXISTING);
1077         }
1078
1079         /* Create new one if not found. */
1080         sc = g_raid_create_node(mp, "Promise", md);
1081         if (sc == NULL)
1082                 return (G_RAID_MD_TASTE_FAIL);
1083         md->mdo_softc = sc;
1084         *gp = sc->sc_geom;
1085         return (G_RAID_MD_TASTE_NEW);
1086 }
1087
1088 static int
1089 g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp,
1090                               struct g_consumer *cp, struct g_geom **gp)
1091 {
1092         struct g_consumer *rcp;
1093         struct g_provider *pp;
1094         struct g_raid_softc *sc;
1095         struct g_raid_disk *disk;
1096         struct promise_raid_conf *meta, *metaarr[4];
1097         struct g_raid_md_promise_perdisk *pd;
1098         struct g_geom *geom;
1099         int i, j, result, len, subdisks;
1100         char name[16];
1101         uint16_t vendor;
1102
1103         G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name);
1104         pp = cp->provider;
1105
1106         /* Read metadata from device. */
1107         meta = NULL;
1108         vendor = 0xffff;
1109         if (g_access(cp, 1, 0, 0) != 0)
1110                 return (G_RAID_MD_TASTE_FAIL);
1111         g_topology_unlock();
1112         len = 2;
1113         if (pp->geom->rank == 1)
1114                 g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
1115         subdisks = promise_meta_read(cp, metaarr);
1116         g_topology_lock();
1117         g_access(cp, -1, 0, 0);
1118         if (subdisks == 0) {
1119                 if (g_raid_aggressive_spare) {
1120                         if (vendor == 0x105a || vendor == 0x1002) {
1121                                 G_RAID_DEBUG(1,
1122                                     "No Promise metadata, forcing spare.");
1123                                 goto search;
1124                         } else {
1125                                 G_RAID_DEBUG(1,
1126                                     "Promise/ATI vendor mismatch "
1127                                     "0x%04x != 0x105a/0x1002",
1128                                     vendor);
1129                         }
1130                 }
1131                 return (G_RAID_MD_TASTE_FAIL);
1132         }
1133
1134         /* Metadata valid. Print it. */
1135         for (i = 0; i < subdisks; i++)
1136                 g_raid_md_promise_print(metaarr[i]);
1137
1138         /* Purge meaningless (empty/spare) records. */
1139         for (i = 0; i < subdisks; ) {
1140                 if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) {
1141                         i++;
1142                         continue;
1143                 }
1144                 free(metaarr[i], M_MD_PROMISE);
1145                 for (j = i; j < subdisks - 1; j++)
1146                         metaarr[i] = metaarr[j + 1];
1147                 metaarr[subdisks - 1] = NULL;
1148                 subdisks--;
1149         }
1150
1151 search:
1152         /* Search for matching node. */
1153         sc = NULL;
1154         LIST_FOREACH(geom, &mp->geom, geom) {
1155                 sc = geom->softc;
1156                 if (sc == NULL)
1157                         continue;
1158                 if (sc->sc_stopping != 0)
1159                         continue;
1160                 if (sc->sc_md->mdo_class != md->mdo_class)
1161                         continue;
1162                 break;
1163         }
1164
1165         /* Found matching node. */
1166         if (geom != NULL) {
1167                 G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
1168                 result = G_RAID_MD_TASTE_EXISTING;
1169
1170         } else { /* Not found matching node -- create one. */
1171                 result = G_RAID_MD_TASTE_NEW;
1172                 snprintf(name, sizeof(name), "Promise");
1173                 sc = g_raid_create_node(mp, name, md);
1174                 md->mdo_softc = sc;
1175                 geom = sc->sc_geom;
1176         }
1177
1178         rcp = g_new_consumer(geom);
1179         g_attach(rcp, pp);
1180         if (g_access(rcp, 1, 1, 1) != 0)
1181                 ; //goto fail1;
1182
1183         g_topology_unlock();
1184         sx_xlock(&sc->sc_lock);
1185
1186         pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
1187         pd->pd_subdisks = subdisks;
1188         for (i = 0; i < subdisks; i++)
1189                 pd->pd_meta[i] = metaarr[i];
1190         disk = g_raid_create_disk(sc);
1191         disk->d_md_data = (void *)pd;
1192         disk->d_consumer = rcp;
1193         rcp->private = disk;
1194
1195         g_raid_get_disk_info(disk);
1196
1197         g_raid_md_promise_new_disk(disk);
1198
1199         sx_xunlock(&sc->sc_lock);
1200         g_topology_lock();
1201         *gp = geom;
1202         return (result);
1203 }
1204
1205 static int
1206 g_raid_md_event_promise(struct g_raid_md_object *md,
1207     struct g_raid_disk *disk, u_int event)
1208 {
1209         struct g_raid_softc *sc;
1210
1211         sc = md->mdo_softc;
1212         if (disk == NULL)
1213                 return (-1);
1214         switch (event) {
1215         case G_RAID_DISK_E_DISCONNECTED:
1216                 /* Delete disk. */
1217                 g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
1218                 g_raid_destroy_disk(disk);
1219                 g_raid_md_promise_purge_volumes(sc);
1220
1221                 /* Write updated metadata to all disks. */
1222                 g_raid_md_write_promise(md, NULL, NULL, NULL);
1223
1224                 /* Check if anything left. */
1225                 if (g_raid_ndisks(sc, -1) == 0)
1226                         g_raid_destroy_node(sc, 0);
1227                 else
1228                         g_raid_md_promise_refill(sc);
1229                 return (0);
1230         }
1231         return (-2);
1232 }
1233
1234 static int
1235 g_raid_md_volume_event_promise(struct g_raid_md_object *md,
1236     struct g_raid_volume *vol, u_int event)
1237 {
1238         struct g_raid_md_promise_pervolume *pv;
1239
1240         pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
1241         switch (event) {
1242         case G_RAID_VOLUME_E_STARTMD:
1243                 if (!pv->pv_started)
1244                         g_raid_md_promise_start(vol);
1245                 return (0);
1246         }
1247         return (-2);
1248 }
1249
1250 static int
1251 g_raid_md_ctl_promise(struct g_raid_md_object *md,
1252     struct gctl_req *req)
1253 {
1254         struct g_raid_softc *sc;
1255         struct g_raid_volume *vol, *vol1;
1256         struct g_raid_subdisk *sd;
1257         struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS];
1258         struct g_raid_md_promise_perdisk *pd;
1259         struct g_raid_md_promise_pervolume *pv;
1260         struct g_consumer *cp;
1261         struct g_provider *pp;
1262         char arg[16];
1263         const char *nodename, *verb, *volname, *levelname, *diskname;
1264         char *tmp;
1265         int *nargs, *force;
1266         off_t esize, offs[PROMISE_MAX_DISKS], size, sectorsize, strip;
1267         intmax_t *sizearg, *striparg;
1268         int numdisks, i, len, level, qual;
1269         int error;
1270
1271         sc = md->mdo_softc;
1272         verb = gctl_get_param(req, "verb", NULL);
1273         nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1274         error = 0;
1275         if (strcmp(verb, "label") == 0) {
1276
1277                 if (*nargs < 4) {
1278                         gctl_error(req, "Invalid number of arguments.");
1279                         return (-1);
1280                 }
1281                 volname = gctl_get_asciiparam(req, "arg1");
1282                 if (volname == NULL) {
1283                         gctl_error(req, "No volume name.");
1284                         return (-2);
1285                 }
1286                 levelname = gctl_get_asciiparam(req, "arg2");
1287                 if (levelname == NULL) {
1288                         gctl_error(req, "No RAID level.");
1289                         return (-3);
1290                 }
1291                 if (strcasecmp(levelname, "RAID5") == 0)
1292                         levelname = "RAID5-LA";
1293                 if (g_raid_volume_str2level(levelname, &level, &qual)) {
1294                         gctl_error(req, "Unknown RAID level '%s'.", levelname);
1295                         return (-4);
1296                 }
1297                 numdisks = *nargs - 3;
1298                 force = gctl_get_paraml(req, "force", sizeof(*force));
1299                 if (!g_raid_md_promise_supported(level, qual, numdisks,
1300                     force ? *force : 0)) {
1301                         gctl_error(req, "Unsupported RAID level "
1302                             "(0x%02x/0x%02x), or number of disks (%d).",
1303                             level, qual, numdisks);
1304                         return (-5);
1305                 }
1306
1307                 /* Search for disks, connect them and probe. */
1308                 size = INT64_MAX;
1309                 sectorsize = 0;
1310                 bzero(disks, sizeof(disks));
1311                 bzero(offs, sizeof(offs));
1312                 for (i = 0; i < numdisks; i++) {
1313                         snprintf(arg, sizeof(arg), "arg%d", i + 3);
1314                         diskname = gctl_get_asciiparam(req, arg);
1315                         if (diskname == NULL) {
1316                                 gctl_error(req, "No disk name (%s).", arg);
1317                                 error = -6;
1318                                 break;
1319                         }
1320                         if (strcmp(diskname, "NONE") == 0)
1321                                 continue;
1322
1323                         TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1324                                 if (disk->d_consumer != NULL && 
1325                                     disk->d_consumer->provider != NULL &&
1326                                     strcmp(disk->d_consumer->provider->name,
1327                                      diskname) == 0)
1328                                         break;
1329                         }
1330                         if (disk != NULL) {
1331                                 if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
1332                                         gctl_error(req, "Disk '%s' is in a "
1333                                             "wrong state (%s).", diskname,
1334                                             g_raid_disk_state2str(disk->d_state));
1335                                         error = -7;
1336                                         break;
1337                                 }
1338                                 pd = disk->d_md_data;
1339                                 if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) {
1340                                         gctl_error(req, "Disk '%s' already "
1341                                             "used by %d volumes.",
1342                                             diskname, pd->pd_subdisks);
1343                                         error = -7;
1344                                         break;
1345                                 }
1346                                 pp = disk->d_consumer->provider;
1347                                 disks[i] = disk;
1348                                 promise_meta_unused_range(pd->pd_meta,
1349                                     pd->pd_subdisks,
1350                                     pp->mediasize / pp->sectorsize,
1351                                     &offs[i], &esize);
1352                                 size = MIN(size, (off_t)esize * pp->sectorsize);
1353                                 sectorsize = MAX(sectorsize, pp->sectorsize);
1354                                 continue;
1355                         }
1356
1357                         g_topology_lock();
1358                         cp = g_raid_open_consumer(sc, diskname);
1359                         if (cp == NULL) {
1360                                 gctl_error(req, "Can't open disk '%s'.",
1361                                     diskname);
1362                                 g_topology_unlock();
1363                                 error = -8;
1364                                 break;
1365                         }
1366                         pp = cp->provider;
1367                         pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
1368                         disk = g_raid_create_disk(sc);
1369                         disk->d_md_data = (void *)pd;
1370                         disk->d_consumer = cp;
1371                         disks[i] = disk;
1372                         cp->private = disk;
1373                         g_topology_unlock();
1374
1375                         g_raid_get_disk_info(disk);
1376
1377                         /* Reserve some space for metadata. */
1378                         size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize);
1379                         sectorsize = MAX(sectorsize, pp->sectorsize);
1380                 }
1381                 if (error != 0) {
1382                         for (i = 0; i < numdisks; i++) {
1383                                 if (disks[i] != NULL &&
1384                                     disks[i]->d_state == G_RAID_DISK_S_NONE)
1385                                         g_raid_destroy_disk(disks[i]);
1386                         }
1387                         return (error);
1388                 }
1389
1390                 if (sectorsize <= 0) {
1391                         gctl_error(req, "Can't get sector size.");
1392                         return (-8);
1393                 }
1394
1395                 /* Handle size argument. */
1396                 len = sizeof(*sizearg);
1397                 sizearg = gctl_get_param(req, "size", &len);
1398                 if (sizearg != NULL && len == sizeof(*sizearg) &&
1399                     *sizearg > 0) {
1400                         if (*sizearg > size) {
1401                                 gctl_error(req, "Size too big %lld > %lld.",
1402                                     (long long)*sizearg, (long long)size);
1403                                 return (-9);
1404                         }
1405                         size = *sizearg;
1406                 }
1407
1408                 /* Handle strip argument. */
1409                 strip = 131072;
1410                 len = sizeof(*striparg);
1411                 striparg = gctl_get_param(req, "strip", &len);
1412                 if (striparg != NULL && len == sizeof(*striparg) &&
1413                     *striparg > 0) {
1414                         if (*striparg < sectorsize) {
1415                                 gctl_error(req, "Strip size too small.");
1416                                 return (-10);
1417                         }
1418                         if (*striparg % sectorsize != 0) {
1419                                 gctl_error(req, "Incorrect strip size.");
1420                                 return (-11);
1421                         }
1422                         strip = *striparg;
1423                 }
1424
1425                 /* Round size down to strip or sector. */
1426                 if (level == G_RAID_VOLUME_RL_RAID1 ||
1427                     level == G_RAID_VOLUME_RL_SINGLE ||
1428                     level == G_RAID_VOLUME_RL_CONCAT)
1429                         size -= (size % sectorsize);
1430                 else if (level == G_RAID_VOLUME_RL_RAID1E &&
1431                     (numdisks & 1) != 0)
1432                         size -= (size % (2 * strip));
1433                 else
1434                         size -= (size % strip);
1435                 if (size <= 0) {
1436                         gctl_error(req, "Size too small.");
1437                         return (-13);
1438                 }
1439
1440                 /* We have all we need, create things: volume, ... */
1441                 pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
1442                 arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0);
1443                 pv->pv_generation = 0;
1444                 pv->pv_started = 1;
1445                 vol = g_raid_create_volume(sc, volname, -1);
1446                 vol->v_md_data = pv;
1447                 vol->v_raid_level = level;
1448                 vol->v_raid_level_qualifier = qual;
1449                 vol->v_strip_size = strip;
1450                 vol->v_disks_count = numdisks;
1451                 if (level == G_RAID_VOLUME_RL_RAID0 ||
1452                     level == G_RAID_VOLUME_RL_CONCAT ||
1453                     level == G_RAID_VOLUME_RL_SINGLE)
1454                         vol->v_mediasize = size * numdisks;
1455                 else if (level == G_RAID_VOLUME_RL_RAID1)
1456                         vol->v_mediasize = size;
1457                 else if (level == G_RAID_VOLUME_RL_RAID3 ||
1458                     level == G_RAID_VOLUME_RL_RAID5)
1459                         vol->v_mediasize = size * (numdisks - 1);
1460                 else { /* RAID1E */
1461                         vol->v_mediasize = ((size * numdisks) / strip / 2) *
1462                             strip;
1463                 }
1464                 vol->v_sectorsize = sectorsize;
1465                 g_raid_start_volume(vol);
1466
1467                 /* , and subdisks. */
1468                 for (i = 0; i < numdisks; i++) {
1469                         disk = disks[i];
1470                         sd = &vol->v_subdisks[i];
1471                         sd->sd_disk = disk;
1472                         sd->sd_offset = (off_t)offs[i] * 512;
1473                         sd->sd_size = size;
1474                         if (disk == NULL)
1475                                 continue;
1476                         TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1477                         g_raid_change_disk_state(disk,
1478                             G_RAID_DISK_S_ACTIVE);
1479                         g_raid_change_subdisk_state(sd,
1480                             G_RAID_SUBDISK_S_ACTIVE);
1481                         g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1482                             G_RAID_EVENT_SUBDISK);
1483                 }
1484
1485                 /* Write metadata based on created entities. */
1486                 G_RAID_DEBUG1(0, sc, "Array started.");
1487                 g_raid_md_write_promise(md, vol, NULL, NULL);
1488
1489                 /* Pickup any STALE/SPARE disks to refill array if needed. */
1490                 g_raid_md_promise_refill(sc);
1491
1492                 g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1493                     G_RAID_EVENT_VOLUME);
1494                 return (0);
1495         }
1496         if (strcmp(verb, "add") == 0) {
1497
1498                 gctl_error(req, "`add` command is not applicable, "
1499                     "use `label` instead.");
1500                 return (-99);
1501         }
1502         if (strcmp(verb, "delete") == 0) {
1503
1504                 nodename = gctl_get_asciiparam(req, "arg0");
1505                 if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0)
1506                         nodename = NULL;
1507
1508                 /* Full node destruction. */
1509                 if (*nargs == 1 && nodename != NULL) {
1510                         /* Check if some volume is still open. */
1511                         force = gctl_get_paraml(req, "force", sizeof(*force));
1512                         if (force != NULL && *force == 0 &&
1513                             g_raid_nopens(sc) != 0) {
1514                                 gctl_error(req, "Some volume is still open.");
1515                                 return (-4);
1516                         }
1517
1518                         TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1519                                 if (disk->d_consumer)
1520                                         promise_meta_erase(disk->d_consumer);
1521                         }
1522                         g_raid_destroy_node(sc, 0);
1523                         return (0);
1524                 }
1525
1526                 /* Destroy specified volume. If it was last - all node. */
1527                 if (*nargs > 2) {
1528                         gctl_error(req, "Invalid number of arguments.");
1529                         return (-1);
1530                 }
1531                 volname = gctl_get_asciiparam(req,
1532                     nodename != NULL ? "arg1" : "arg0");
1533                 if (volname == NULL) {
1534                         gctl_error(req, "No volume name.");
1535                         return (-2);
1536                 }
1537
1538                 /* Search for volume. */
1539                 TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1540                         if (strcmp(vol->v_name, volname) == 0)
1541                                 break;
1542                         pp = vol->v_provider;
1543                         if (pp == NULL)
1544                                 continue;
1545                         if (strcmp(pp->name, volname) == 0)
1546                                 break;
1547                         if (strncmp(pp->name, "raid/", 5) == 0 &&
1548                             strcmp(pp->name + 5, volname) == 0)
1549                                 break;
1550                 }
1551                 if (vol == NULL) {
1552                         i = strtol(volname, &tmp, 10);
1553                         if (verb != volname && tmp[0] == 0) {
1554                                 TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1555                                         if (vol->v_global_id == i)
1556                                                 break;
1557                                 }
1558                         }
1559                 }
1560                 if (vol == NULL) {
1561                         gctl_error(req, "Volume '%s' not found.", volname);
1562                         return (-3);
1563                 }
1564
1565                 /* Check if volume is still open. */
1566                 force = gctl_get_paraml(req, "force", sizeof(*force));
1567                 if (force != NULL && *force == 0 &&
1568                     vol->v_provider_open != 0) {
1569                         gctl_error(req, "Volume is still open.");
1570                         return (-4);
1571                 }
1572
1573                 /* Destroy volume and potentially node. */
1574                 i = 0;
1575                 TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
1576                         i++;
1577                 if (i >= 2) {
1578                         g_raid_destroy_volume(vol);
1579                         g_raid_md_promise_purge_disks(sc);
1580                         g_raid_md_write_promise(md, NULL, NULL, NULL);
1581                 } else {
1582                         TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1583                                 if (disk->d_consumer)
1584                                         promise_meta_erase(disk->d_consumer);
1585                         }
1586                         g_raid_destroy_node(sc, 0);
1587                 }
1588                 return (0);
1589         }
1590         if (strcmp(verb, "remove") == 0 ||
1591             strcmp(verb, "fail") == 0) {
1592                 if (*nargs < 2) {
1593                         gctl_error(req, "Invalid number of arguments.");
1594                         return (-1);
1595                 }
1596                 for (i = 1; i < *nargs; i++) {
1597                         snprintf(arg, sizeof(arg), "arg%d", i);
1598                         diskname = gctl_get_asciiparam(req, arg);
1599                         if (diskname == NULL) {
1600                                 gctl_error(req, "No disk name (%s).", arg);
1601                                 error = -2;
1602                                 break;
1603                         }
1604                         if (strncmp(diskname, "/dev/", 5) == 0)
1605                                 diskname += 5;
1606
1607                         TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1608                                 if (disk->d_consumer != NULL && 
1609                                     disk->d_consumer->provider != NULL &&
1610                                     strcmp(disk->d_consumer->provider->name,
1611                                      diskname) == 0)
1612                                         break;
1613                         }
1614                         if (disk == NULL) {
1615                                 gctl_error(req, "Disk '%s' not found.",
1616                                     diskname);
1617                                 error = -3;
1618                                 break;
1619                         }
1620
1621                         if (strcmp(verb, "fail") == 0) {
1622                                 g_raid_md_fail_disk_promise(md, NULL, disk);
1623                                 continue;
1624                         }
1625
1626                         /* Erase metadata on deleting disk and destroy it. */
1627                         promise_meta_erase(disk->d_consumer);
1628                         g_raid_destroy_disk(disk);
1629                 }
1630                 g_raid_md_promise_purge_volumes(sc);
1631
1632                 /* Write updated metadata to remaining disks. */
1633                 g_raid_md_write_promise(md, NULL, NULL, NULL);
1634
1635                 /* Check if anything left. */
1636                 if (g_raid_ndisks(sc, -1) == 0)
1637                         g_raid_destroy_node(sc, 0);
1638                 else
1639                         g_raid_md_promise_refill(sc);
1640                 return (error);
1641         }
1642         if (strcmp(verb, "insert") == 0) {
1643                 if (*nargs < 2) {
1644                         gctl_error(req, "Invalid number of arguments.");
1645                         return (-1);
1646                 }
1647                 for (i = 1; i < *nargs; i++) {
1648                         /* Get disk name. */
1649                         snprintf(arg, sizeof(arg), "arg%d", i);
1650                         diskname = gctl_get_asciiparam(req, arg);
1651                         if (diskname == NULL) {
1652                                 gctl_error(req, "No disk name (%s).", arg);
1653                                 error = -3;
1654                                 break;
1655                         }
1656
1657                         /* Try to find provider with specified name. */
1658                         g_topology_lock();
1659                         cp = g_raid_open_consumer(sc, diskname);
1660                         if (cp == NULL) {
1661                                 gctl_error(req, "Can't open disk '%s'.",
1662                                     diskname);
1663                                 g_topology_unlock();
1664                                 error = -4;
1665                                 break;
1666                         }
1667                         pp = cp->provider;
1668                         g_topology_unlock();
1669
1670                         pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
1671
1672                         disk = g_raid_create_disk(sc);
1673                         disk->d_consumer = cp;
1674                         disk->d_md_data = (void *)pd;
1675                         cp->private = disk;
1676
1677                         g_raid_get_disk_info(disk);
1678
1679                         /* Welcome the "new" disk. */
1680                         g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
1681                         promise_meta_write_spare(cp);
1682                         g_raid_md_promise_refill(sc);
1683                 }
1684                 return (error);
1685         }
1686         return (-100);
1687 }
1688
1689 static int
1690 g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol,
1691     struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
1692 {
1693         struct g_raid_softc *sc;
1694         struct g_raid_volume *vol;
1695         struct g_raid_subdisk *sd;
1696         struct g_raid_disk *disk;
1697         struct g_raid_md_promise_perdisk *pd;
1698         struct g_raid_md_promise_pervolume *pv;
1699         struct promise_raid_conf *meta;
1700         off_t rebuild_lba64;
1701         int i, j, pos, rebuild;
1702
1703         sc = md->mdo_softc;
1704
1705         if (sc->sc_stopping == G_RAID_DESTROY_HARD)
1706                 return (0);
1707
1708         /* Generate new per-volume metadata for affected volumes. */
1709         TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1710                 if (vol->v_stopping)
1711                         continue;
1712
1713                 /* Skip volumes not related to specified targets. */
1714                 if (tvol != NULL && vol != tvol)
1715                         continue;
1716                 if (tsd != NULL && vol != tsd->sd_volume)
1717                         continue;
1718                 if (tdisk != NULL) {
1719                         for (i = 0; i < vol->v_disks_count; i++) {
1720                                 if (vol->v_subdisks[i].sd_disk == tdisk)
1721                                         break;
1722                         }
1723                         if (i >= vol->v_disks_count)
1724                                 continue;
1725                 }
1726
1727                 pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
1728                 pv->pv_generation++;
1729
1730                 meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
1731                 if (pv->pv_meta != NULL)
1732                         memcpy(meta, pv->pv_meta, sizeof(*meta));
1733                 memcpy(meta->promise_id, PROMISE_MAGIC,
1734                     sizeof(PROMISE_MAGIC) - 1);
1735                 meta->dummy_0 = 0x00020000;
1736                 meta->integrity = PROMISE_I_VALID;
1737
1738                 meta->generation = pv->pv_generation;
1739                 meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE |
1740                     PROMISE_S_INITED | PROMISE_S_READY;
1741                 if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED)
1742                         meta->status |= PROMISE_S_DEGRADED;
1743                 if (vol->v_dirty)
1744                         meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */
1745                 if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 ||
1746                     vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE)
1747                         meta->type = PROMISE_T_RAID0;
1748                 else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1749                     vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
1750                         meta->type = PROMISE_T_RAID1;
1751                 else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3)
1752                         meta->type = PROMISE_T_RAID3;
1753                 else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
1754                         meta->type = PROMISE_T_RAID5;
1755                 else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT)
1756                         meta->type = PROMISE_T_SPAN;
1757                 else
1758                         meta->type = PROMISE_T_JBOD;
1759                 meta->total_disks = vol->v_disks_count;
1760                 meta->stripe_shift = ffs(vol->v_strip_size / 1024);
1761                 meta->array_width = vol->v_disks_count;
1762                 if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1763                     vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
1764                         meta->array_width /= 2;
1765                 meta->array_number = vol->v_global_id;
1766                 meta->total_sectors = vol->v_mediasize / 512;
1767                 meta->total_sectors_high = (vol->v_mediasize / 512) >> 32;
1768                 meta->sector_size = vol->v_sectorsize / 512;
1769                 meta->cylinders = meta->total_sectors / (255 * 63) - 1;
1770                 meta->heads = 254;
1771                 meta->sectors = 63;
1772                 meta->volume_id = pv->pv_id;
1773                 rebuild_lba64 = UINT64_MAX;
1774                 rebuild = 0;
1775                 for (i = 0; i < vol->v_disks_count; i++) {
1776                         sd = &vol->v_subdisks[i];
1777                         /* For RAID0+1 we need to translate order. */
1778                         pos = promise_meta_translate_disk(vol, i);
1779                         meta->disks[pos].flags = PROMISE_F_VALID |
1780                             PROMISE_F_ASSIGNED;
1781                         if (sd->sd_state == G_RAID_SUBDISK_S_NONE) {
1782                                 meta->disks[pos].flags |= 0;
1783                         } else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) {
1784                                 meta->disks[pos].flags |=
1785                                     PROMISE_F_DOWN | PROMISE_F_REDIR;
1786                         } else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) {
1787                                 meta->disks[pos].flags |=
1788                                     PROMISE_F_ONLINE | PROMISE_F_REDIR;
1789                                 if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
1790                                         rebuild_lba64 = MIN(rebuild_lba64,
1791                                             sd->sd_rebuild_pos / 512);
1792                                 } else
1793                                         rebuild_lba64 = 0;
1794                                 rebuild = 1;
1795                         } else {
1796                                 meta->disks[pos].flags |= PROMISE_F_ONLINE;
1797                                 if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) {
1798                                         meta->status |= PROMISE_S_MARKED;
1799                                         if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
1800                                                 rebuild_lba64 = MIN(rebuild_lba64,
1801                                                     sd->sd_rebuild_pos / 512);
1802                                         } else
1803                                                 rebuild_lba64 = 0;
1804                                 }
1805                         }
1806                         if (pv->pv_meta != NULL) {
1807                                 meta->disks[pos].id = pv->pv_meta->disks[pos].id;
1808                         } else {
1809                                 meta->disks[pos].number = i * 2;
1810                                 arc4rand(&meta->disks[pos].id,
1811                                     sizeof(meta->disks[pos].id), 0);
1812                         }
1813                 }
1814                 promise_meta_put_name(meta, vol->v_name);
1815
1816                 /* Try to mimic AMD BIOS rebuild/resync behavior. */
1817                 if (rebuild_lba64 != UINT64_MAX) {
1818                         if (rebuild)
1819                                 meta->magic_3 = 0x03040010UL; /* Rebuild? */
1820                         else
1821                                 meta->magic_3 = 0x03040008UL; /* Resync? */
1822                         /* Translate from per-disk to per-volume LBA. */
1823                         if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1824                             vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
1825                                 rebuild_lba64 *= meta->array_width;
1826                         } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
1827                             vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) {
1828                                 rebuild_lba64 *= meta->array_width - 1;
1829                         } else
1830                                 rebuild_lba64 = 0;
1831                 } else
1832                         meta->magic_3 = 0x03000000UL;
1833                 meta->rebuild_lba64 = rebuild_lba64;
1834                 meta->magic_4 = 0x04010101UL;
1835
1836                 /* Replace per-volume metadata with new. */
1837                 if (pv->pv_meta != NULL)
1838                         free(pv->pv_meta, M_MD_PROMISE);
1839                 pv->pv_meta = meta;
1840
1841                 /* Copy new metadata to the disks, adding or replacing old. */
1842                 for (i = 0; i < vol->v_disks_count; i++) {
1843                         sd = &vol->v_subdisks[i];
1844                         disk = sd->sd_disk;
1845                         if (disk == NULL)
1846                                 continue;
1847                         /* For RAID0+1 we need to translate order. */
1848                         pos = promise_meta_translate_disk(vol, i);
1849                         pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1850                         for (j = 0; j < pd->pd_subdisks; j++) {
1851                                 if (pd->pd_meta[j]->volume_id == meta->volume_id)
1852                                         break;
1853                         }
1854                         if (j == pd->pd_subdisks)
1855                                 pd->pd_subdisks++;
1856                         if (pd->pd_meta[j] != NULL)
1857                                 free(pd->pd_meta[j], M_MD_PROMISE);
1858                         pd->pd_meta[j] = promise_meta_copy(meta);
1859                         pd->pd_meta[j]->disk = meta->disks[pos];
1860                         pd->pd_meta[j]->disk.number = pos;
1861                         pd->pd_meta[j]->disk_offset_high =
1862                             (sd->sd_offset / 512) >> 32;
1863                         pd->pd_meta[j]->disk_offset = sd->sd_offset / 512;
1864                         pd->pd_meta[j]->disk_sectors_high =
1865                             (sd->sd_size / 512) >> 32;
1866                         pd->pd_meta[j]->disk_sectors = sd->sd_size / 512;
1867                         if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
1868                                 pd->pd_meta[j]->disk_rebuild_high =
1869                                     (sd->sd_rebuild_pos / 512) >> 32;
1870                                 pd->pd_meta[j]->disk_rebuild =
1871                                     sd->sd_rebuild_pos / 512;
1872                         } else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD) {
1873                                 pd->pd_meta[j]->disk_rebuild_high = 0;
1874                                 pd->pd_meta[j]->disk_rebuild = 0;
1875                         } else {
1876                                 pd->pd_meta[j]->disk_rebuild_high = UINT32_MAX;
1877                                 pd->pd_meta[j]->disk_rebuild = UINT32_MAX;
1878                         }
1879                         pd->pd_updated = 1;
1880                 }
1881         }
1882
1883         TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1884                 pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1885                 if (disk->d_state != G_RAID_DISK_S_ACTIVE)
1886                         continue;
1887                 if (!pd->pd_updated)
1888                         continue;
1889                 G_RAID_DEBUG(1, "Writing Promise metadata to %s",
1890                     g_raid_get_diskname(disk));
1891                 for (i = 0; i < pd->pd_subdisks; i++)
1892                         g_raid_md_promise_print(pd->pd_meta[i]);
1893                 promise_meta_write(disk->d_consumer,
1894                     pd->pd_meta, pd->pd_subdisks);
1895                 pd->pd_updated = 0;
1896         }
1897
1898         return (0);
1899 }
1900
1901 static int
1902 g_raid_md_fail_disk_promise(struct g_raid_md_object *md,
1903     struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
1904 {
1905         struct g_raid_softc *sc;
1906         struct g_raid_md_promise_perdisk *pd;
1907         struct g_raid_subdisk *sd;
1908         int i, pos;
1909
1910         sc = md->mdo_softc;
1911         pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data;
1912
1913         /* We can't fail disk that is not a part of array now. */
1914         if (tdisk->d_state != G_RAID_DISK_S_ACTIVE)
1915                 return (-1);
1916
1917         /*
1918          * Mark disk as failed in metadata and try to write that metadata
1919          * to the disk itself to prevent it's later resurrection as STALE.
1920          */
1921         if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL)
1922                 G_RAID_DEBUG(1, "Writing Promise metadata to %s",
1923                     g_raid_get_diskname(tdisk));
1924         for (i = 0; i < pd->pd_subdisks; i++) {
1925                 pd->pd_meta[i]->disk.flags |=
1926                     PROMISE_F_DOWN | PROMISE_F_REDIR;
1927                 pos = pd->pd_meta[i]->disk.number;
1928                 if (pos >= 0 && pos < PROMISE_MAX_DISKS) {
1929                         pd->pd_meta[i]->disks[pos].flags |=
1930                             PROMISE_F_DOWN | PROMISE_F_REDIR;
1931                 }
1932                 g_raid_md_promise_print(pd->pd_meta[i]);
1933         }
1934         if (tdisk->d_consumer != NULL)
1935                 promise_meta_write(tdisk->d_consumer,
1936                     pd->pd_meta, pd->pd_subdisks);
1937
1938         /* Change states. */
1939         g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
1940         TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
1941                 g_raid_change_subdisk_state(sd,
1942                     G_RAID_SUBDISK_S_FAILED);
1943                 g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
1944                     G_RAID_EVENT_SUBDISK);
1945         }
1946
1947         /* Write updated metadata to remaining disks. */
1948         g_raid_md_write_promise(md, NULL, NULL, tdisk);
1949
1950         g_raid_md_promise_refill(sc);
1951         return (0);
1952 }
1953
1954 static int
1955 g_raid_md_free_disk_promise(struct g_raid_md_object *md,
1956     struct g_raid_disk *disk)
1957 {
1958         struct g_raid_md_promise_perdisk *pd;
1959         int i;
1960
1961         pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1962         for (i = 0; i < pd->pd_subdisks; i++) {
1963                 if (pd->pd_meta[i] != NULL) {
1964                         free(pd->pd_meta[i], M_MD_PROMISE);
1965                         pd->pd_meta[i] = NULL;
1966                 }
1967         }
1968         free(pd, M_MD_PROMISE);
1969         disk->d_md_data = NULL;
1970         return (0);
1971 }
1972
1973 static int
1974 g_raid_md_free_volume_promise(struct g_raid_md_object *md,
1975     struct g_raid_volume *vol)
1976 {
1977         struct g_raid_md_promise_pervolume *pv;
1978
1979         pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
1980         if (pv && pv->pv_meta != NULL) {
1981                 free(pv->pv_meta, M_MD_PROMISE);
1982                 pv->pv_meta = NULL;
1983         }
1984         if (pv && !pv->pv_started) {
1985                 pv->pv_started = 1;
1986                 callout_stop(&pv->pv_start_co);
1987         }
1988         free(pv, M_MD_PROMISE);
1989         vol->v_md_data = NULL;
1990         return (0);
1991 }
1992
/*
 * Object-level free method of the Promise metadata module.
 *
 * This implementation does no work: the argument is ignored and 0 is
 * returned unconditionally.
 */
static int
g_raid_md_free_promise(struct g_raid_md_object *md)
{

	(void)md;	/* Intentionally unused. */
	return (0);
}
1999
/*
 * NOTE(review): presumably this registers the "promise" metadata class
 * (display name "Promise") with the GEOM RAID framework -- confirm against
 * the definition of G_RAID_MD_DECLARE, which is not part of this file.
 */
2000 G_RAID_MD_DECLARE(promise, "Promise");