/*-
 * Copyright (c) 2011 Alexander Motin <mav@FreeBSD.org>
 * Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "g_raid_md_if.h"

static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata");

#define PROMISE_MAX_DISKS       8
#define PROMISE_MAX_SUBDISKS    2
#define PROMISE_META_OFFSET     14
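
/*
 * Judging by the read/write routines below, Promise keeps one metadata
 * record per subdisk in the last sectors of a disk: the record for
 * subdisk n starts 63 - n * PROMISE_META_OFFSET sectors before the end
 * of the media and spans four sectors.
 */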

struct promise_raid_disk {
        uint8_t         flags;                  /* Subdisk status. */
#define PROMISE_F_VALID         0x01
#define PROMISE_F_ONLINE        0x02
#define PROMISE_F_ASSIGNED      0x04
#define PROMISE_F_SPARE         0x08
#define PROMISE_F_DUPLICATE     0x10
#define PROMISE_F_REDIR         0x20
#define PROMISE_F_DOWN          0x40
#define PROMISE_F_READY         0x80

        uint8_t         number;                 /* Position in a volume. */
        uint8_t         channel;                /* ATA channel number. */
        uint8_t         device;                 /* ATA device number. */
        uint64_t        id __packed;            /* Subdisk ID. */
} __packed;

struct promise_raid_conf {
        char            promise_id[24];
#define PROMISE_MAGIC           "Promise Technology, Inc."
#define FREEBSD_MAGIC           "FreeBSD ATA driver RAID "

        uint32_t        dummy_0;
        uint64_t        magic_0;
#define PROMISE_MAGIC0(x)       (((uint64_t)(x.channel) << 48) | \
                                ((uint64_t)(x.device != 0) << 56))
        uint16_t        magic_1;
        uint32_t        magic_2;
        uint8_t         filler1[470];

        uint32_t        integrity;
#define PROMISE_I_VALID         0x00000080

        struct promise_raid_disk        disk;   /* This subdisk info. */
        uint32_t        disk_offset;            /* Subdisk offset. */
        uint32_t        disk_sectors;           /* Subdisk size. */
        uint32_t        rebuild_lba;            /* Rebuild position. */
        uint16_t        generation;             /* Generation number. */
        uint8_t         status;                 /* Volume status. */
#define PROMISE_S_VALID         0x01
#define PROMISE_S_ONLINE        0x02
#define PROMISE_S_INITED        0x04
#define PROMISE_S_READY         0x08
#define PROMISE_S_DEGRADED      0x10
#define PROMISE_S_MARKED        0x20
#define PROMISE_S_MIGRATING     0x40
#define PROMISE_S_FUNCTIONAL    0x80

        uint8_t         type;                   /* Volume type. */
#define PROMISE_T_RAID0         0x00
#define PROMISE_T_RAID1         0x01
#define PROMISE_T_RAID3         0x02
#define PROMISE_T_RAID5         0x04
#define PROMISE_T_SPAN          0x08
#define PROMISE_T_JBOD          0x10

        uint8_t         total_disks;            /* Disks in this volume. */
        uint8_t         stripe_shift;           /* Strip size. */
        uint8_t         array_width;            /* Number of RAID0 stripes. */
        uint8_t         array_number;           /* Global volume number. */
        uint32_t        total_sectors;          /* Volume size. */
        uint16_t        cylinders;              /* Volume geometry: C. */
        uint8_t         heads;                  /* Volume geometry: H. */
        uint8_t         sectors;                /* Volume geometry: S. */
        uint64_t        volume_id __packed;     /* Volume ID. */
        struct promise_raid_disk        disks[PROMISE_MAX_DISKS];
                                                /* Subdisks in this volume. */
        char            name[32];               /* Volume label. */

        uint32_t        filler2[8];
        uint32_t        magic_3;        /* Something related to rebuild. */
        uint64_t        rebuild_lba64;  /* Per-volume rebuild position. */
        uint32_t        magic_4;
        uint32_t        magic_5;
        uint32_t        total_sectors_high;
        uint32_t        filler3[324];
        uint32_t        checksum;
} __packed;
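
/*
 * The structure above is exactly 2048 bytes (four 512-byte sectors), i.e.
 * 512 32-bit words.  The checksum stored in the last word is the plain
 * 32-bit sum of the preceding 511 words, which is how the read and write
 * routines below compute it.
 */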

struct g_raid_md_promise_perdisk {
        int              pd_updated;
        int              pd_subdisks;
        struct promise_raid_conf        *pd_meta[PROMISE_MAX_SUBDISKS];
};

struct g_raid_md_promise_pervolume {
        struct promise_raid_conf        *pv_meta;
        uint64_t                         pv_id;
        uint16_t                         pv_generation;
        int                              pv_disks_present;
        int                              pv_started;
        struct callout                   pv_start_co;   /* STARTING state timer. */
};

static g_raid_md_create_t g_raid_md_create_promise;
static g_raid_md_taste_t g_raid_md_taste_promise;
static g_raid_md_event_t g_raid_md_event_promise;
static g_raid_md_volume_event_t g_raid_md_volume_event_promise;
static g_raid_md_ctl_t g_raid_md_ctl_promise;
static g_raid_md_write_t g_raid_md_write_promise;
static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise;
static g_raid_md_free_disk_t g_raid_md_free_disk_promise;
static g_raid_md_free_volume_t g_raid_md_free_volume_promise;
static g_raid_md_free_t g_raid_md_free_promise;

static kobj_method_t g_raid_md_promise_methods[] = {
        KOBJMETHOD(g_raid_md_create,    g_raid_md_create_promise),
        KOBJMETHOD(g_raid_md_taste,     g_raid_md_taste_promise),
        KOBJMETHOD(g_raid_md_event,     g_raid_md_event_promise),
        KOBJMETHOD(g_raid_md_volume_event,      g_raid_md_volume_event_promise),
        KOBJMETHOD(g_raid_md_ctl,       g_raid_md_ctl_promise),
        KOBJMETHOD(g_raid_md_write,     g_raid_md_write_promise),
        KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_promise),
        KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_promise),
        KOBJMETHOD(g_raid_md_free_volume,       g_raid_md_free_volume_promise),
        KOBJMETHOD(g_raid_md_free,      g_raid_md_free_promise),
        { 0, 0 }
};

static struct g_raid_md_class g_raid_md_promise_class = {
        "Promise",
        g_raid_md_promise_methods,
        sizeof(struct g_raid_md_object),
        .mdc_enable = 1,
        .mdc_priority = 100
};

static void
g_raid_md_promise_print(struct promise_raid_conf *meta)
{
        int i;

        if (g_raid_debug < 1)
                return;

        printf("********* ATA Promise Metadata *********\n");
        printf("promise_id          <%.24s>\n", meta->promise_id);
        printf("disk                %02x %02x %02x %02x %016jx\n",
            meta->disk.flags, meta->disk.number, meta->disk.channel,
            meta->disk.device, meta->disk.id);
        printf("disk_offset         %u\n", meta->disk_offset);
        printf("disk_sectors        %u\n", meta->disk_sectors);
        printf("rebuild_lba         %u\n", meta->rebuild_lba);
        printf("generation          %u\n", meta->generation);
        printf("status              0x%02x\n", meta->status);
        printf("type                %u\n", meta->type);
        printf("total_disks         %u\n", meta->total_disks);
        printf("stripe_shift        %u\n", meta->stripe_shift);
        printf("array_width         %u\n", meta->array_width);
        printf("array_number        %u\n", meta->array_number);
        printf("total_sectors       %u\n", meta->total_sectors);
        printf("cylinders           %u\n", meta->cylinders);
        printf("heads               %u\n", meta->heads);
        printf("sectors             %u\n", meta->sectors);
        printf("volume_id           0x%016jx\n", meta->volume_id);
        printf("disks:\n");
        for (i = 0; i < PROMISE_MAX_DISKS; i++) {
                printf("                    %02x %02x %02x %02x %016jx\n",
                    meta->disks[i].flags, meta->disks[i].number,
                    meta->disks[i].channel, meta->disks[i].device,
                    meta->disks[i].id);
        }
        printf("name                <%.32s>\n", meta->name);
        printf("magic_3             0x%08x\n", meta->magic_3);
        printf("rebuild_lba64       %ju\n", meta->rebuild_lba64);
        printf("magic_4             0x%08x\n", meta->magic_4);
        printf("magic_5             0x%08x\n", meta->magic_5);
        printf("total_sectors_high  0x%08x\n", meta->total_sectors_high);
        printf("=================================================\n");
}

static struct promise_raid_conf *
promise_meta_copy(struct promise_raid_conf *meta)
{
        struct promise_raid_conf *nmeta;

        nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK);
        memcpy(nmeta, meta, sizeof(*nmeta));
        return (nmeta);
}

static int
promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id)
{
        int pos;

        for (pos = 0; pos < meta->total_disks; pos++) {
                if (meta->disks[pos].id == id)
                        return (pos);
        }
        return (-1);
}

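/*
 * Find the largest contiguous range of sectors not claimed by any of the
 * nsd subdisk records in metaarr.  The last 131072 sectors (64MB at 512
 * bytes per sector) are excluded up front, apparently reserved for the
 * metadata itself.  Returns 1 and fills *off/*size if a non-empty range
 * exists.
 */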
static int
promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd,
    uint32_t sectors, uint32_t *off, uint32_t *size)
{
        uint32_t coff, csize;
        int i, j;

        sectors -= 131072;
        *off = 0;
        *size = 0;
        coff = 0;
        csize = sectors;
        i = 0;
        while (1) {
                for (j = 0; j < nsd; j++) {
                        if (metaarr[j]->disk_offset >= coff) {
                                csize = MIN(csize,
                                    metaarr[j]->disk_offset - coff);
                        }
                }
                if (csize > *size) {
                        *off = coff;
                        *size = csize;
                }
                if (i >= nsd)
                        break;
                coff = metaarr[i]->disk_offset + metaarr[i]->disk_sectors;
                csize = sectors - coff;
                i++;
        }
        return ((*size > 0) ? 1 : 0);
}

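/*
 * Translate a disk position from metadata order to g_raid order.  For
 * RAID1E (Promise RAID0+1) the metadata seemingly enumerates one mirror
 * half after the other, while g_raid interleaves the mirror copies, so
 * the index is transposed across the array width.  Other levels map 1:1.
 */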
static int
promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos)
{
        int disk_pos, width;

        if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
                width = vol->v_disks_count / 2;
                disk_pos = (md_disk_pos / width) +
                    (md_disk_pos % width) * width;
        } else
                disk_pos = md_disk_pos;
        return (disk_pos);
}

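/*
 * Volume names are stored as a space-padded, not NUL-terminated, 32-byte
 * field.  promise_meta_get_name() copies the name out and trims trailing
 * padding; the caller must supply a 33-byte buffer.
 * promise_meta_put_name() does the reverse.
 */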
static void
promise_meta_get_name(struct promise_raid_conf *meta, char *buf)
{
        int i;

        strncpy(buf, meta->name, 32);
        buf[32] = 0;
        for (i = 31; i >= 0; i--) {
                if (buf[i] > 0x20)
                        break;
                buf[i] = 0;
        }
}

static void
promise_meta_put_name(struct promise_raid_conf *meta, char *buf)
{

        memset(meta->name, 0x20, 32);
        memcpy(meta->name, buf, MIN(strlen(buf), 32));
}

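/*
 * Read and validate up to PROMISE_MAX_SUBDISKS metadata records from the
 * tail of the provider.  Each record must pass the magic string, checksum
 * and integrity-flag checks; reading stops at the first invalid record.
 * Returns the number of valid records stored into metaarr[]; the caller
 * owns the allocated copies.
 */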
static int
promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr)
{
        struct g_provider *pp;
        struct promise_raid_conf *meta;
        char *buf;
        int error, i, subdisks;
        uint32_t checksum, *ptr;

        pp = cp->provider;
        subdisks = 0;
next:
        /* Read metadata block. */
        buf = g_read_data(cp, pp->mediasize - pp->sectorsize *
            (63 - subdisks * PROMISE_META_OFFSET),
            pp->sectorsize * 4, &error);
        if (buf == NULL) {
                G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
                    pp->name, error);
                return (subdisks);
        }
        meta = (struct promise_raid_conf *)buf;

        /* Check if this is a Promise RAID struct. */
        if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) &&
            strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) {
                if (subdisks == 0)
                        G_RAID_DEBUG(1,
                            "Promise signature check failed on %s", pp->name);
                g_free(buf);
                return (subdisks);
        }
        meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK);
        memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4));
        g_free(buf);

        /* Check metadata checksum. */
        for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
                checksum += *ptr++;
        if (checksum != meta->checksum) {
                G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name);
                free(meta, M_MD_PROMISE);
                return (subdisks);
        }

        if ((meta->integrity & PROMISE_I_VALID) == 0) {
                G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name);
                free(meta, M_MD_PROMISE);
                return (subdisks);
        }

        if (meta->total_disks > PROMISE_MAX_DISKS) {
                G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)",
                    pp->name, meta->total_disks);
                free(meta, M_MD_PROMISE);
                return (subdisks);
        }

        /* Save this part and look for next. */
        *metaarr = meta;
        metaarr++;
        subdisks++;
        if (subdisks < PROMISE_MAX_SUBDISKS)
                goto next;

        return (subdisks);
}

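/*
 * Write metadata records back to disk.  All PROMISE_MAX_SUBDISKS slots
 * are rewritten: the real records first, then optionally one generated
 * record describing the largest unused extent, then zeroed sectors for
 * any remaining slots, so stale records cannot survive a rewrite.
 */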
static int
promise_meta_write(struct g_consumer *cp,
    struct promise_raid_conf **metaarr, int nsd)
{
        struct g_provider *pp;
        struct promise_raid_conf *meta;
        char *buf;
        int error, i, subdisk, fake;
        uint32_t checksum, *ptr, off, size;

        pp = cp->provider;
        subdisk = 0;
        fake = 0;
next:
        buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO);
        meta = NULL;
        if (subdisk < nsd) {
                meta = metaarr[subdisk];
        } else if (!fake && promise_meta_unused_range(metaarr, nsd,
            cp->provider->mediasize / cp->provider->sectorsize,
            &off, &size)) {
                /* Optionally add record for unused space. */
                meta = (struct promise_raid_conf *)buf;
                memcpy(&meta->promise_id[0], PROMISE_MAGIC,
                    sizeof(PROMISE_MAGIC) - 1);
                meta->dummy_0 = 0x00020000;
                meta->integrity = PROMISE_I_VALID;
                meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID;
                meta->disk.number = 0xff;
                arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
                meta->disk_offset = off;
                meta->disk_sectors = size;
                meta->rebuild_lba = UINT32_MAX;
                fake = 1;
        }
        if (meta != NULL) {
                /* Recalculate checksum in case the metadata was changed. */
                meta->checksum = 0;
                for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
                        checksum += *ptr++;
                meta->checksum = checksum;
                memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta)));
        }
        error = g_write_data(cp, pp->mediasize - pp->sectorsize *
            (63 - subdisk * PROMISE_META_OFFSET),
            buf, pp->sectorsize * 4);
        if (error != 0) {
                G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
                    pp->name, error);
        }
        free(buf, M_MD_PROMISE);

        subdisk++;
        if (subdisk < PROMISE_MAX_SUBDISKS)
                goto next;

        return (error);
}

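/*
 * Erase all metadata slots by overwriting them with zeroes.
 */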
static int
promise_meta_erase(struct g_consumer *cp)
{
        struct g_provider *pp;
        char *buf;
        int error, subdisk;

        pp = cp->provider;
        buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO);
        for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) {
                error = g_write_data(cp, pp->mediasize - pp->sectorsize *
                    (63 - subdisk * PROMISE_META_OFFSET),
                    buf, 4 * pp->sectorsize);
                if (error != 0) {
                        G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
                            pp->name, error);
                }
        }
        free(buf, M_MD_PROMISE);
        return (error);
}

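/*
 * Mark a disk as a hot spare: write a single record flagged
 * PROMISE_F_SPARE with a random ID, covering the whole disk minus the
 * 131072-sector reserved tail.
 */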
static int
promise_meta_write_spare(struct g_consumer *cp)
{
        struct promise_raid_conf *meta;
        int error;

        meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
        memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1);
        meta->dummy_0 = 0x00020000;
        meta->integrity = PROMISE_I_VALID;
        meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID;
        meta->disk.number = 0xff;
        arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
        meta->disk_sectors = cp->provider->mediasize / cp->provider->sectorsize;
        meta->disk_sectors -= 131072;
        meta->rebuild_lba = UINT32_MAX;
        error = promise_meta_write(cp, &meta, 1);
        free(meta, M_MD_PROMISE);
        return (error);
}

static struct g_raid_volume *
g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id)
{
        struct g_raid_volume    *vol;
        struct g_raid_md_promise_pervolume *pv;

        TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
                pv = vol->v_md_data;
                if (pv->pv_id == id)
                        break;
        }
        return (vol);
}

static int
g_raid_md_promise_purge_volumes(struct g_raid_softc *sc)
{
        struct g_raid_volume    *vol, *tvol;
        struct g_raid_md_promise_pervolume *pv;
        int i, res;

        res = 0;
        TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) {
                pv = vol->v_md_data;
                if (!pv->pv_started || vol->v_stopping)
                        continue;
                for (i = 0; i < vol->v_disks_count; i++) {
                        if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE)
                                break;
                }
                if (i >= vol->v_disks_count) {
                        g_raid_destroy_volume(vol);
                        res = 1;
                }
        }
        return (res);
}

static int
g_raid_md_promise_purge_disks(struct g_raid_softc *sc)
{
        struct g_raid_disk      *disk, *tdisk;
        struct g_raid_volume    *vol;
        struct g_raid_md_promise_perdisk *pd;
        int i, j, res;

        res = 0;
        TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
                if (disk->d_state == G_RAID_DISK_S_SPARE)
                        continue;
                pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;

                /* Scan for deleted volumes. */
                for (i = 0; i < pd->pd_subdisks; ) {
                        vol = g_raid_md_promise_get_volume(sc,
                            pd->pd_meta[i]->volume_id);
                        if (vol != NULL && !vol->v_stopping) {
                                i++;
                                continue;
                        }
                        free(pd->pd_meta[i], M_MD_PROMISE);
                        for (j = i; j < pd->pd_subdisks - 1; j++)
                                pd->pd_meta[j] = pd->pd_meta[j + 1];
                        pd->pd_meta[PROMISE_MAX_SUBDISKS - 1] = NULL;
                        pd->pd_subdisks--;
                        pd->pd_updated = 1;
                }

                /* If there is no metadata left - erase and delete disk. */
                if (pd->pd_subdisks == 0) {
                        promise_meta_erase(disk->d_consumer);
                        g_raid_destroy_disk(disk);
                        res = 1;
                }
        }
        return (res);
}

static int
g_raid_md_promise_supported(int level, int qual, int disks, int force)
{

        if (disks > PROMISE_MAX_DISKS)
                return (0);
        switch (level) {
        case G_RAID_VOLUME_RL_RAID0:
                if (disks < 1)
                        return (0);
                if (!force && disks < 2)
                        return (0);
                break;
        case G_RAID_VOLUME_RL_RAID1:
                if (disks < 1)
                        return (0);
                if (!force && (disks != 2))
                        return (0);
                break;
        case G_RAID_VOLUME_RL_RAID1E:
                if (disks < 2)
                        return (0);
                if (disks % 2 != 0)
                        return (0);
                if (!force && (disks != 4))
                        return (0);
                break;
        case G_RAID_VOLUME_RL_SINGLE:
                if (disks != 1)
                        return (0);
                break;
        case G_RAID_VOLUME_RL_CONCAT:
                if (disks < 2)
                        return (0);
                break;
        case G_RAID_VOLUME_RL_RAID5:
                if (disks < 3)
                        return (0);
                if (qual != G_RAID_VOLUME_RLQ_R5LA)
                        return (0);
                break;
        default:
                return (0);
        }
        if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE)
                return (0);
        return (1);
}

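/*
 * Give a disk a place in the volume: either the slot recorded in its own
 * metadata (subdisk sdn), or, if that fails and the volume is already
 * running, a free extent big enough to rebuild a missing or failed
 * subdisk.  Returns nonzero if the disk was "resurrected" into a slot it
 * was not recorded in, meaning the metadata must be rewritten.
 */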
static int
g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn,
    struct g_raid_volume *vol)
{
        struct g_raid_softc *sc;
        struct g_raid_subdisk *sd;
        struct g_raid_md_promise_perdisk *pd;
        struct g_raid_md_promise_pervolume *pv;
        struct promise_raid_conf *meta;
        off_t size;
        int disk_pos, md_disk_pos, i, resurrection = 0;
        uint32_t eoff, esize;

        sc = disk->d_softc;
        pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;

        pv = vol->v_md_data;
        meta = pv->pv_meta;

        if (sdn >= 0) {
                /* Find disk position in metadata by its ID. */
                md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id);
                /* For RAID0+1 we need to translate order. */
                disk_pos = promise_meta_translate_disk(vol, md_disk_pos);
        } else {
                md_disk_pos = -1;
                disk_pos = -1;
        }
        if (disk_pos < 0) {
                G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s",
                    g_raid_get_diskname(disk), vol->v_name);
                /* Failed stale disk is useless for us. */
                if (sdn >= 0 &&
                    pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) {
                        g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
                        return (0);
                }
                /* If we were given specific metadata subdisk - erase it. */
                if (sdn >= 0) {
                        free(pd->pd_meta[sdn], M_MD_PROMISE);
                        for (i = sdn; i < pd->pd_subdisks - 1; i++)
                                pd->pd_meta[i] = pd->pd_meta[i + 1];
                        pd->pd_meta[PROMISE_MAX_SUBDISKS - 1] = NULL;
                        pd->pd_subdisks--;
                }
                /* If we are in the start process, that's all for now. */
                if (!pv->pv_started)
                        goto nofit;
                /*
                 * If we have already started - try to make use of the disk.
                 * Try to replace OFFLINE disks first, then FAILED.
                 */
                promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks,
                    disk->d_consumer->provider->mediasize /
                    disk->d_consumer->provider->sectorsize,
                    &eoff, &esize);
                if (esize == 0) {
                        G_RAID_DEBUG1(1, sc, "No free space on disk %s",
                            g_raid_get_diskname(disk));
                        goto nofit;
                }
                size = INT64_MAX;
                for (i = 0; i < vol->v_disks_count; i++) {
                        sd = &vol->v_subdisks[i];
                        if (sd->sd_state != G_RAID_SUBDISK_S_NONE)
                                size = sd->sd_size;
                        /* Prefer the slot in the worst (lowest) state. */
                        if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED &&
                            (disk_pos < 0 ||
                             sd->sd_state < vol->v_subdisks[disk_pos].sd_state))
                                disk_pos = i;
                }
                if (disk_pos >= 0 &&
                    vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT &&
                    (off_t)esize * 512 < size) {
                        G_RAID_DEBUG1(1, sc, "Disk %s free space "
                            "is too small (%ju < %ju)",
                            g_raid_get_diskname(disk),
                            (off_t)esize * 512, size);
                        disk_pos = -1;
                }
                if (disk_pos >= 0) {
                        if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT)
                                esize = size / 512;
                        /* For RAID0+1 we need to translate order. */
                        md_disk_pos = promise_meta_translate_disk(vol, disk_pos);
                } else {
nofit:
                        if (pd->pd_subdisks == 0) {
                                g_raid_change_disk_state(disk,
                                    G_RAID_DISK_S_SPARE);
                        }
                        return (0);
                }
                G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s",
                    g_raid_get_diskname(disk), disk_pos, vol->v_name);
                resurrection = 1;
        }

        sd = &vol->v_subdisks[disk_pos];

        if (resurrection && sd->sd_disk != NULL) {
                g_raid_change_disk_state(sd->sd_disk,
                    G_RAID_DISK_S_STALE_FAILED);
                TAILQ_REMOVE(&sd->sd_disk->d_subdisks,
                    sd, sd_next);
        }
        vol->v_subdisks[disk_pos].sd_disk = disk;
        TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);

        /* Welcome the new disk. */
        if (resurrection)
                g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
        else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN)
                g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
        else
                g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);

        if (resurrection) {
                sd->sd_offset = (off_t)eoff * 512;
                sd->sd_size = (off_t)esize * 512;
        } else {
                sd->sd_offset = (off_t)pd->pd_meta[sdn]->disk_offset * 512;
                sd->sd_size = (off_t)pd->pd_meta[sdn]->disk_sectors * 512;
        }

        if (resurrection) {
                /* Stale disk, almost same as new. */
                g_raid_change_subdisk_state(sd,
                    G_RAID_SUBDISK_S_NEW);
        } else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) {
                /* Failed disk. */
                g_raid_change_subdisk_state(sd,
                    G_RAID_SUBDISK_S_FAILED);
        } else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) {
                /* Rebuilding disk. */
                g_raid_change_subdisk_state(sd,
                    G_RAID_SUBDISK_S_REBUILD);
                if (pd->pd_meta[sdn]->generation != meta->generation)
                        sd->sd_rebuild_pos = 0;
                else {
                        sd->sd_rebuild_pos =
                            (off_t)pd->pd_meta[sdn]->rebuild_lba * 512;
                }
        } else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) {
                /* Rebuilding disk. */
                g_raid_change_subdisk_state(sd,
                    G_RAID_SUBDISK_S_NEW);
        } else if (pd->pd_meta[sdn]->generation != meta->generation ||
            (meta->status & PROMISE_S_MARKED)) {
                /* Stale disk or dirty volume (unclean shutdown). */
                g_raid_change_subdisk_state(sd,
                    G_RAID_SUBDISK_S_STALE);
        } else {
                /* Up to date disk. */
                g_raid_change_subdisk_state(sd,
                    G_RAID_SUBDISK_S_ACTIVE);
        }
        g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
            G_RAID_EVENT_SUBDISK);

        return (resurrection);
}

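/*
 * Walk all started volumes looking for missing or failed subdisks and
 * try to take over spare or partially used disks to replace them,
 * restarting the scan after every successful replacement.
 */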
static void
g_raid_md_promise_refill(struct g_raid_softc *sc)
{
        struct g_raid_volume *vol;
        struct g_raid_subdisk *sd;
        struct g_raid_disk *disk;
        struct g_raid_md_object *md;
        struct g_raid_md_promise_perdisk *pd;
        struct g_raid_md_promise_pervolume *pv;
        int update, updated, i, bad;

        md = sc->sc_md;
restart:
        updated = 0;
        TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
                pv = vol->v_md_data;
                if (!pv->pv_started || vol->v_stopping)
                        continue;

                /* Search for subdisk that needs replacement. */
                bad = 0;
                for (i = 0; i < vol->v_disks_count; i++) {
                        sd = &vol->v_subdisks[i];
                        if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
                            sd->sd_state == G_RAID_SUBDISK_S_FAILED)
                                bad = 1;
                }
                if (!bad)
                        continue;

                G_RAID_DEBUG1(1, sc, "Volume %s is not complete, "
                    "trying to refill.", vol->v_name);

                TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
                        /* Skip failed. */
                        if (disk->d_state < G_RAID_DISK_S_SPARE)
                                continue;
                        /* Skip already used by this volume. */
                        for (i = 0; i < vol->v_disks_count; i++) {
                                sd = &vol->v_subdisks[i];
                                if (sd->sd_disk == disk)
                                        break;
                        }
                        if (i < vol->v_disks_count)
                                continue;

                        /* Try to use disk if it has empty extents. */
                        pd = disk->d_md_data;
                        if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) {
                                update =
                                    g_raid_md_promise_start_disk(disk, -1, vol);
                        } else
                                update = 0;
                        if (update) {
                                updated = 1;
                                g_raid_md_write_promise(md, vol, NULL, disk);
                                break;
                        }
                }
        }
        if (updated)
                goto restart;
}

static void
g_raid_md_promise_start(struct g_raid_volume *vol)
{
        struct g_raid_softc *sc;
        struct g_raid_subdisk *sd;
        struct g_raid_disk *disk;
        struct g_raid_md_object *md;
        struct g_raid_md_promise_perdisk *pd;
        struct g_raid_md_promise_pervolume *pv;
        struct promise_raid_conf *meta;
        int i;

        sc = vol->v_softc;
        md = sc->sc_md;
        pv = vol->v_md_data;
        meta = pv->pv_meta;

        vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
        if (meta->type == PROMISE_T_RAID0)
                vol->v_raid_level = G_RAID_VOLUME_RL_RAID0;
        else if (meta->type == PROMISE_T_RAID1) {
                if (meta->array_width == 1)
                        vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
                else
                        vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
        } else if (meta->type == PROMISE_T_RAID3)
                vol->v_raid_level = G_RAID_VOLUME_RL_RAID3;
        else if (meta->type == PROMISE_T_RAID5) {
                vol->v_raid_level = G_RAID_VOLUME_RL_RAID5;
                vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA;
        } else if (meta->type == PROMISE_T_SPAN)
                vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT;
        else if (meta->type == PROMISE_T_JBOD)
                vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE;
        else
                vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
        vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ
        vol->v_disks_count = meta->total_disks;
        vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ
        if (meta->total_sectors_high < 256) /* If value looks sane. */
                vol->v_mediasize |=
                    ((off_t)meta->total_sectors_high << 32) * 512; //ZZZ
        vol->v_sectorsize = 512; //ZZZ
        for (i = 0; i < vol->v_disks_count; i++) {
                sd = &vol->v_subdisks[i];
                sd->sd_offset = (off_t)meta->disk_offset * 512; //ZZZ
                sd->sd_size = (off_t)meta->disk_sectors * 512; //ZZZ
        }
        g_raid_start_volume(vol);

        /* Make all the disks found so far take their places. */
        TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
                pd = disk->d_md_data;
                for (i = 0; i < pd->pd_subdisks; i++) {
                        if (pd->pd_meta[i]->volume_id == meta->volume_id)
                                g_raid_md_promise_start_disk(disk, i, vol);
                }
        }

        pv->pv_started = 1;
        callout_stop(&pv->pv_start_co);
        G_RAID_DEBUG1(0, sc, "Volume started.");
        g_raid_md_write_promise(md, vol, NULL, NULL);

        /* Pick up any STALE/SPARE disks to refill the array if needed. */
        g_raid_md_promise_refill(sc);

        g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME);
}

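/*
 * Callout handler: if a volume has not collected all of its disks within
 * g_raid_start_timeout seconds, force it to start with what it has.
 */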
static void
g_raid_promise_go(void *arg)
{
        struct g_raid_volume *vol;
        struct g_raid_softc *sc;
        struct g_raid_md_promise_pervolume *pv;

        vol = arg;
        pv = vol->v_md_data;
        sc = vol->v_softc;
        if (!pv->pv_started) {
                G_RAID_DEBUG1(0, sc, "Force volume start due to timeout.");
                g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD,
                    G_RAID_EVENT_VOLUME);
        }
}

static void
g_raid_md_promise_new_disk(struct g_raid_disk *disk)
{
        struct g_raid_softc *sc;
        struct g_raid_md_object *md;
        struct promise_raid_conf *pdmeta;
        struct g_raid_md_promise_perdisk *pd;
        struct g_raid_md_promise_pervolume *pv;
        struct g_raid_volume *vol;
        int i;
        char buf[33];

        sc = disk->d_softc;
        md = sc->sc_md;
        pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;

        if (pd->pd_subdisks == 0) {
                g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
                g_raid_md_promise_refill(sc);
                return;
        }

        for (i = 0; i < pd->pd_subdisks; i++) {
                pdmeta = pd->pd_meta[i];

                /* Look for volume with matching ID. */
                vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
                if (vol == NULL) {
                        promise_meta_get_name(pdmeta, buf);
                        vol = g_raid_create_volume(sc, buf, pdmeta->array_number);
                        pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
                        pv->pv_id = pdmeta->volume_id;
                        vol->v_md_data = pv;
                        callout_init(&pv->pv_start_co, 1);
                        callout_reset(&pv->pv_start_co,
                            g_raid_start_timeout * hz,
                            g_raid_promise_go, vol);
                } else
                        pv = vol->v_md_data;

                /* If we haven't started yet - check metadata freshness. */
                if (pv->pv_meta == NULL || !pv->pv_started) {
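                        /*
                         * Compare generations as a wrapping 16-bit serial
                         * number: casting the difference to int16_t keeps
                         * the comparison correct even after the generation
                         * counter wraps around.
                         */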
                        if (pv->pv_meta == NULL ||
                            ((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) {
                                G_RAID_DEBUG1(1, sc, "Newer disk");
                                if (pv->pv_meta != NULL)
                                        free(pv->pv_meta, M_MD_PROMISE);
                                pv->pv_meta = promise_meta_copy(pdmeta);
                                pv->pv_generation = pv->pv_meta->generation;
                                pv->pv_disks_present = 1;
                        } else if (pdmeta->generation == pv->pv_generation) {
                                pv->pv_disks_present++;
                                G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)",
                                    pv->pv_disks_present,
                                    pv->pv_meta->total_disks);
                        } else {
                                G_RAID_DEBUG1(1, sc, "Older disk");
                        }
                }
        }

        for (i = 0; i < pd->pd_subdisks; i++) {
                pdmeta = pd->pd_meta[i];

                /* Look for volume with matching ID. */
                vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
                if (vol == NULL)
                        continue;
                pv = vol->v_md_data;

                if (pv->pv_started) {
                        if (g_raid_md_promise_start_disk(disk, i, vol))
                                g_raid_md_write_promise(md, vol, NULL, NULL);
                } else {
                        /* If we collected all needed disks - start array. */
                        if (pv->pv_disks_present == pv->pv_meta->total_disks)
                                g_raid_md_promise_start(vol);
                }
        }
}

static int
g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp,
    struct g_geom **gp)
{
        struct g_geom *geom;
        struct g_raid_softc *sc;

        /* Search for existing node. */
        LIST_FOREACH(geom, &mp->geom, geom) {
                sc = geom->softc;
                if (sc == NULL)
                        continue;
                if (sc->sc_stopping != 0)
                        continue;
                if (sc->sc_md->mdo_class != md->mdo_class)
                        continue;
                break;
        }
        if (geom != NULL) {
                *gp = geom;
                return (G_RAID_MD_TASTE_EXISTING);
        }

        /* Create new one if not found. */
        sc = g_raid_create_node(mp, "Promise", md);
        if (sc == NULL)
                return (G_RAID_MD_TASTE_FAIL);
        md->mdo_softc = sc;
        *gp = sc->sc_geom;
        return (G_RAID_MD_TASTE_NEW);
}

static int
g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp,
                              struct g_consumer *cp, struct g_geom **gp)
{
        struct g_consumer *rcp;
        struct g_provider *pp;
        struct g_raid_softc *sc;
        struct g_raid_disk *disk;
        struct promise_raid_conf *meta, *metaarr[4];
        struct g_raid_md_promise_perdisk *pd;
        struct g_geom *geom;
        int error, i, j, result, len, subdisks;
        char name[16];
        uint16_t vendor;

        G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name);
        pp = cp->provider;

        /* Read metadata from device. */
        meta = NULL;
        vendor = 0xffff;
        if (g_access(cp, 1, 0, 0) != 0)
                return (G_RAID_MD_TASTE_FAIL);
        g_topology_unlock();
        len = 2;
        if (pp->geom->rank == 1)
                g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
        subdisks = promise_meta_read(cp, metaarr);
        g_topology_lock();
        g_access(cp, -1, 0, 0);
        if (subdisks == 0) {
                if (g_raid_aggressive_spare) {
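                        /*
                         * 0x105a is the Promise Technology PCI vendor ID;
                         * 0x1002 is ATI/AMD, whose RAID BIOSes use
                         * Promise-compatible metadata.
                         */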
                        if (vendor == 0x105a || vendor == 0x1002) {
                                G_RAID_DEBUG(1,
                                    "No Promise metadata, forcing spare.");
                                goto search;
                        } else {
                                G_RAID_DEBUG(1,
                                    "Promise/ATI vendor mismatch "
                                    "0x%04x != 0x105a/0x1002",
                                    vendor);
                        }
                }
                return (G_RAID_MD_TASTE_FAIL);
        }

        /* Metadata valid. Print it. */
        for (i = 0; i < subdisks; i++)
                g_raid_md_promise_print(metaarr[i]);

        /* Purge meaningless (empty/spare) records. */
        for (i = 0; i < subdisks; ) {
                if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) {
                        i++;
                        continue;
                }
                free(metaarr[i], M_MD_PROMISE);
                for (j = i; j < subdisks - 1; j++)
                        metaarr[j] = metaarr[j + 1];
                metaarr[PROMISE_MAX_SUBDISKS - 1] = NULL;
                subdisks--;
        }

search:
        /* Search for matching node. */
        sc = NULL;
        LIST_FOREACH(geom, &mp->geom, geom) {
                sc = geom->softc;
                if (sc == NULL)
                        continue;
                if (sc->sc_stopping != 0)
                        continue;
                if (sc->sc_md->mdo_class != md->mdo_class)
                        continue;
                break;
        }

        /* Found matching node. */
        if (geom != NULL) {
                G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
                result = G_RAID_MD_TASTE_EXISTING;

        } else { /* Not found matching node -- create one. */
                result = G_RAID_MD_TASTE_NEW;
                snprintf(name, sizeof(name), "Promise");
                sc = g_raid_create_node(mp, name, md);
                md->mdo_softc = sc;
                geom = sc->sc_geom;
        }

        rcp = g_new_consumer(geom);
        g_attach(rcp, pp);
        if (g_access(rcp, 1, 1, 1) != 0)
                ; //goto fail1;

        g_topology_unlock();
        sx_xlock(&sc->sc_lock);

        pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
        pd->pd_subdisks = subdisks;
        for (i = 0; i < subdisks; i++)
                pd->pd_meta[i] = metaarr[i];
        disk = g_raid_create_disk(sc);
        disk->d_md_data = (void *)pd;
        disk->d_consumer = rcp;
        rcp->private = disk;

        /* Read kernel dumping information. */
        disk->d_kd.offset = 0;
        disk->d_kd.length = OFF_MAX;
        len = sizeof(disk->d_kd);
        error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd);
        if (disk->d_kd.di.dumper == NULL)
                G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.",
                    rcp->provider->name, error);

        g_raid_md_promise_new_disk(disk);

        sx_xunlock(&sc->sc_lock);
        g_topology_lock();
        *gp = geom;
        return (result);
}


static int
g_raid_md_event_promise(struct g_raid_md_object *md,
    struct g_raid_disk *disk, u_int event)
{
        struct g_raid_softc *sc;

        sc = md->mdo_softc;
        if (disk == NULL)
                return (-1);
        switch (event) {
        case G_RAID_DISK_E_DISCONNECTED:
                /* Delete disk. */
                g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
                g_raid_destroy_disk(disk);
                g_raid_md_promise_purge_volumes(sc);

                /* Write updated metadata to all disks. */
                g_raid_md_write_promise(md, NULL, NULL, NULL);

                /* Check if anything left. */
                if (g_raid_ndisks(sc, -1) == 0)
                        g_raid_destroy_node(sc, 0);
                else
                        g_raid_md_promise_refill(sc);
                return (0);
        }
        return (-2);
}

static int
g_raid_md_volume_event_promise(struct g_raid_md_object *md,
    struct g_raid_volume *vol, u_int event)
{
        struct g_raid_md_promise_pervolume *pv;

        pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
        switch (event) {
        case G_RAID_VOLUME_E_STARTMD:
                if (!pv->pv_started)
                        g_raid_md_promise_start(vol);
                return (0);
        }
        return (-2);
}

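/*
 * Handle control requests from the graid(8) userland utility; the
 * "label" verb creates a new volume on the given disks.
 */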
1207 static int
1208 g_raid_md_ctl_promise(struct g_raid_md_object *md,
1209     struct gctl_req *req)
1210 {
1211         struct g_raid_softc *sc;
1212         struct g_raid_volume *vol, *vol1;
1213         struct g_raid_subdisk *sd;
1214         struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS];
1215         struct g_raid_md_promise_perdisk *pd;
1216         struct g_raid_md_promise_pervolume *pv;
1217         struct g_consumer *cp;
1218         struct g_provider *pp;
1219         char arg[16];
1220         const char *verb, *volname, *levelname, *diskname;
1221         char *tmp;
1222         int *nargs, *force;
1223         off_t size, sectorsize, strip;
1224         intmax_t *sizearg, *striparg;
1225         uint32_t offs[PROMISE_MAX_DISKS], esize;
1226         int numdisks, i, len, level, qual;
1227         int error;
1228
1229         sc = md->mdo_softc;
1230         verb = gctl_get_param(req, "verb", NULL);
1231         nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1232         error = 0;
1233         if (strcmp(verb, "label") == 0) {
1234
1235                 if (*nargs < 4) {
1236                         gctl_error(req, "Invalid number of arguments.");
1237                         return (-1);
1238                 }
1239                 volname = gctl_get_asciiparam(req, "arg1");
1240                 if (volname == NULL) {
1241                         gctl_error(req, "No volume name.");
1242                         return (-2);
1243                 }
1244                 levelname = gctl_get_asciiparam(req, "arg2");
1245                 if (levelname == NULL) {
1246                         gctl_error(req, "No RAID level.");
1247                         return (-3);
1248                 }
1249                 if (strcasecmp(levelname, "RAID5") == 0)
1250                         levelname = "RAID5-LA";
1251                 if (g_raid_volume_str2level(levelname, &level, &qual)) {
1252                         gctl_error(req, "Unknown RAID level '%s'.", levelname);
1253                         return (-4);
1254                 }
1255                 numdisks = *nargs - 3;
1256                 force = gctl_get_paraml(req, "force", sizeof(*force));
1257                 if (!g_raid_md_promise_supported(level, qual, numdisks,
1258                     force ? *force : 0)) {
1259                         gctl_error(req, "Unsupported RAID level "
1260                             "(0x%02x/0x%02x), or number of disks (%d).",
1261                             level, qual, numdisks);
1262                         return (-5);
1263                 }
1264
1265                 /* Search for disks, connect them and probe. */
1266                 size = INT64_MAX;
1267                 sectorsize = 0;
1268                 bzero(disks, sizeof(disks));
1269                 bzero(offs, sizeof(offs));
1270                 for (i = 0; i < numdisks; i++) {
1271                         snprintf(arg, sizeof(arg), "arg%d", i + 3);
1272                         diskname = gctl_get_asciiparam(req, arg);
1273                         if (diskname == NULL) {
1274                                 gctl_error(req, "No disk name (%s).", arg);
1275                                 error = -6;
1276                                 break;
1277                         }
1278                         if (strcmp(diskname, "NONE") == 0)
1279                                 continue;
1280
1281                         TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1282                                 if (disk->d_consumer != NULL && 
1283                                     disk->d_consumer->provider != NULL &&
1284                                     strcmp(disk->d_consumer->provider->name,
1285                                      diskname) == 0)
1286                                         break;
1287                         }
1288                         if (disk != NULL) {
1289                                 if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
1290                                         gctl_error(req, "Disk '%s' is in a "
1291                                             "wrong state (%s).", diskname,
1292                                             g_raid_disk_state2str(disk->d_state));
1293                                         error = -7;
1294                                         break;
1295                                 }
1296                                 pd = disk->d_md_data;
1297                                 if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) {
1298                                         gctl_error(req, "Disk '%s' already "
1299                                             "used by %d volumes.",
1300                                             diskname, pd->pd_subdisks);
1301                                         error = -7;
1302                                         break;
1303                                 }
1304                                 pp = disk->d_consumer->provider;
1305                                 disks[i] = disk;
1306                                 promise_meta_unused_range(pd->pd_meta,
1307                                     pd->pd_subdisks,
1308                                     pp->mediasize / pp->sectorsize,
1309                                     &offs[i], &esize);
1310                                 size = MIN(size, (off_t)esize * pp->sectorsize);
1311                                 sectorsize = MAX(sectorsize, pp->sectorsize);
1312                                 continue;
1313                         }
1314
1315                         g_topology_lock();
1316                         cp = g_raid_open_consumer(sc, diskname);
1317                         if (cp == NULL) {
1318                                 gctl_error(req, "Can't open disk '%s'.",
1319                                     diskname);
1320                                 g_topology_unlock();
1321                                 error = -8;
1322                                 break;
1323                         }
1324                         pp = cp->provider;
1325                         pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
1326                         disk = g_raid_create_disk(sc);
1327                         disk->d_md_data = (void *)pd;
1328                         disk->d_consumer = cp;
1329                         disks[i] = disk;
1330                         cp->private = disk;
1331                         g_topology_unlock();
1332
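                             /*
                              * Promise metadata stores disk and volume sizes as
                              * 32-bit sector counts, so larger providers cannot
                              * be used.
                              */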
1333                         if (pp->mediasize / pp->sectorsize > UINT32_MAX) {
1334                                 gctl_error(req,
1335                                     "Disk '%s' is too big.", diskname);
1336                                 error = -8;
1337                                 break;
1338                         }
1339
1340                         /* Read kernel dumping information. */
1341                         disk->d_kd.offset = 0;
1342                         disk->d_kd.length = OFF_MAX;
1343                         len = sizeof(disk->d_kd);
1344                         g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
1345                         if (disk->d_kd.di.dumper == NULL)
1346                                 G_RAID_DEBUG1(2, sc,
1347                                     "Dumping not supported by %s.",
1348                                     cp->provider->name);
1349
1350                         /* Reserve some space for metadata. */
1351                         size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize);
1352                         sectorsize = MAX(sectorsize, pp->sectorsize);
1353                 }
1354                 if (error != 0) {
1355                         for (i = 0; i < numdisks; i++) {
1356                                 if (disks[i] != NULL &&
1357                                     disks[i]->d_state == G_RAID_DISK_S_NONE)
1358                                         g_raid_destroy_disk(disks[i]);
1359                         }
1360                         return (error);
1361                 }
1362
1363                 if (sectorsize <= 0) {
1364                         gctl_error(req, "Can't get sector size.");
1365                         return (-8);
1366                 }
1367
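                     /*
                      * The optional "size" and "strip" parameters typically come
                      * from graid(8)'s -S and -s flags, e.g. (example invocation):
                      *      graid label -s 65536 Promise vol0 RAID1 ada0 ada1
                      */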
1368                 /* Handle size argument. */
1369                 len = sizeof(*sizearg);
1370                 sizearg = gctl_get_param(req, "size", &len);
1371                 if (sizearg != NULL && len == sizeof(*sizearg) &&
1372                     *sizearg > 0) {
1373                         if (*sizearg > size) {
1374                                 gctl_error(req, "Size too big %lld > %lld.",
1375                                     (long long)*sizearg, (long long)size);
1376                                 return (-9);
1377                         }
1378                         size = *sizearg;
1379                 }
1380
1381                 /* Handle strip argument. */
1382                 strip = 131072;
1383                 len = sizeof(*striparg);
1384                 striparg = gctl_get_param(req, "strip", &len);
1385                 if (striparg != NULL && len == sizeof(*striparg) &&
1386                     *striparg > 0) {
1387                         if (*striparg < sectorsize) {
1388                                 gctl_error(req, "Strip size too small.");
1389                                 return (-10);
1390                         }
1391                         if (*striparg % sectorsize != 0) {
1392                                 gctl_error(req, "Incorrect strip size.");
1393                                 return (-11);
1394                         }
1395                         strip = *striparg;
1396                 }
1397
1398                 /* Round size down to strip or sector. */
1399                 if (level == G_RAID_VOLUME_RL_RAID1 ||
1400                     level == G_RAID_VOLUME_RL_SINGLE ||
1401                     level == G_RAID_VOLUME_RL_CONCAT)
1402                         size -= (size % sectorsize);
1403                 else if (level == G_RAID_VOLUME_RL_RAID1E &&
1404                     (numdisks & 1) != 0)
1405                         size -= (size % (2 * strip));
1406                 else
1407                         size -= (size % strip);
1408                 if (size <= 0) {
1409                         gctl_error(req, "Size too small.");
1410                         return (-13);
1411                 }
1412                 if (size > 0xffffffffllu * sectorsize) {
1413                         gctl_error(req, "Size too big.");
1414                         return (-14);
1415                 }
1416
1417                 /* We have all we need, create things: volume, ... */
1418                 pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
1419                 arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0);
1420                 pv->pv_generation = 0;
1421                 pv->pv_started = 1;
1422                 vol = g_raid_create_volume(sc, volname, -1);
1423                 vol->v_md_data = pv;
1424                 vol->v_raid_level = level;
1425                 vol->v_raid_level_qualifier = qual;
1426                 vol->v_strip_size = strip;
1427                 vol->v_disks_count = numdisks;
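                     /*
                      * The volume media size follows from the RAID level: striped
                      * and concatenated levels aggregate all disks, RAID1 exposes
                      * a single copy, parity levels lose one disk, and RAID1E
                      * exposes half of the total, rounded down to the strip size.
                      */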
1428                 if (level == G_RAID_VOLUME_RL_RAID0 ||
1429                     level == G_RAID_VOLUME_RL_CONCAT ||
1430                     level == G_RAID_VOLUME_RL_SINGLE)
1431                         vol->v_mediasize = size * numdisks;
1432                 else if (level == G_RAID_VOLUME_RL_RAID1)
1433                         vol->v_mediasize = size;
1434                 else if (level == G_RAID_VOLUME_RL_RAID3 ||
1435                     level == G_RAID_VOLUME_RL_RAID5)
1436                         vol->v_mediasize = size * (numdisks - 1);
1437                 else { /* RAID1E */
1438                         vol->v_mediasize = ((size * numdisks) / strip / 2) *
1439                             strip;
1440                 }
1441                 vol->v_sectorsize = sectorsize;
1442                 g_raid_start_volume(vol);
1443
1444                 /* , and subdisks. */
1445                 for (i = 0; i < numdisks; i++) {
1446                         disk = disks[i];
1447                         sd = &vol->v_subdisks[i];
1448                         sd->sd_disk = disk;
1449                         sd->sd_offset = (off_t)offs[i] * 512;
1450                         sd->sd_size = size;
1451                         if (disk == NULL)
1452                                 continue;
1453                         TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1454                         g_raid_change_disk_state(disk,
1455                             G_RAID_DISK_S_ACTIVE);
1456                         g_raid_change_subdisk_state(sd,
1457                             G_RAID_SUBDISK_S_ACTIVE);
1458                         g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1459                             G_RAID_EVENT_SUBDISK);
1460                 }
1461
1462                 /* Write metadata based on created entities. */
1463                 G_RAID_DEBUG1(0, sc, "Array started.");
1464                 g_raid_md_write_promise(md, vol, NULL, NULL);
1465
1466                 /* Pick up any STALE/SPARE disks to refill the array if needed. */
1467                 g_raid_md_promise_refill(sc);
1468
1469                 g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1470                     G_RAID_EVENT_VOLUME);
1471                 return (0);
1472         }
1473         if (strcmp(verb, "add") == 0) {
1474
1475                 gctl_error(req, "`add` command is not applicable, "
1476                     "use `label` instead.");
1477                 return (-99);
1478         }
1479         if (strcmp(verb, "delete") == 0) {
1480
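                     /*
                      * With a single argument the whole node is destroyed;
                      * typically requested as `graid delete <node>' from
                      * graid(8) (example invocation).
                      */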
1481                 /* Full node destruction. */
1482                 if (*nargs == 1) {
1483                         /* Check if some volume is still open. */
1484                         force = gctl_get_paraml(req, "force", sizeof(*force));
1485                         if (force != NULL && *force == 0 &&
1486                             g_raid_nopens(sc) != 0) {
1487                                 gctl_error(req, "Some volume is still open.");
1488                                 return (-4);
1489                         }
1490
1491                         TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1492                                 if (disk->d_consumer)
1493                                         promise_meta_erase(disk->d_consumer);
1494                         }
1495                         g_raid_destroy_node(sc, 0);
1496                         return (0);
1497                 }
1498
1499                 /* Destroy the specified volume. If it was the last one, destroy the whole node. */
1500                 if (*nargs != 2) {
1501                         gctl_error(req, "Invalid number of arguments.");
1502                         return (-1);
1503                 }
1504                 volname = gctl_get_asciiparam(req, "arg1");
1505                 if (volname == NULL) {
1506                         gctl_error(req, "No volume name.");
1507                         return (-2);
1508                 }
1509
1510                 /* Search for volume. */
1511                 TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1512                         if (strcmp(vol->v_name, volname) == 0)
1513                                 break;
1514                 }
1515                 if (vol == NULL) {
1516                         i = strtol(volname, &tmp, 10);
1517                         if (tmp != volname && tmp[0] == 0) {
1518                                 TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1519                                         if (vol->v_global_id == i)
1520                                                 break;
1521                                 }
1522                         }
1523                 }
1524                 if (vol == NULL) {
1525                         gctl_error(req, "Volume '%s' not found.", volname);
1526                         return (-3);
1527                 }
1528
1529                 /* Check if volume is still open. */
1530                 force = gctl_get_paraml(req, "force", sizeof(*force));
1531                 if (force != NULL && *force == 0 &&
1532                     vol->v_provider_open != 0) {
1533                         gctl_error(req, "Volume is still open.");
1534                         return (-4);
1535                 }
1536
1537                 /* Destroy volume and potentially node. */
1538                 i = 0;
1539                 TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
1540                         i++;
1541                 if (i >= 2) {
1542                         g_raid_destroy_volume(vol);
1543                         g_raid_md_promise_purge_disks(sc);
1544                         g_raid_md_write_promise(md, NULL, NULL, NULL);
1545                 } else {
1546                         TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1547                                 if (disk->d_consumer)
1548                                         promise_meta_erase(disk->d_consumer);
1549                         }
1550                         g_raid_destroy_node(sc, 0);
1551                 }
1552                 return (0);
1553         }
1554         if (strcmp(verb, "remove") == 0 ||
1555             strcmp(verb, "fail") == 0) {
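                     /*
                      * `remove' erases the disk's metadata and drops it from the
                      * node; `fail' only marks it failed so it can be rebuilt or
                      * replaced later.
                      */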
1556                 if (*nargs < 2) {
1557                         gctl_error(req, "Invalid number of arguments.");
1558                         return (-1);
1559                 }
1560                 for (i = 1; i < *nargs; i++) {
1561                         snprintf(arg, sizeof(arg), "arg%d", i);
1562                         diskname = gctl_get_asciiparam(req, arg);
1563                         if (diskname == NULL) {
1564                                 gctl_error(req, "No disk name (%s).", arg);
1565                                 error = -2;
1566                                 break;
1567                         }
1568                         if (strncmp(diskname, "/dev/", 5) == 0)
1569                                 diskname += 5;
1570
1571                         TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1572                                 if (disk->d_consumer != NULL && 
1573                                     disk->d_consumer->provider != NULL &&
1574                                     strcmp(disk->d_consumer->provider->name,
1575                                      diskname) == 0)
1576                                         break;
1577                         }
1578                         if (disk == NULL) {
1579                                 gctl_error(req, "Disk '%s' not found.",
1580                                     diskname);
1581                                 error = -3;
1582                                 break;
1583                         }
1584
1585                         if (strcmp(verb, "fail") == 0) {
1586                                 g_raid_md_fail_disk_promise(md, NULL, disk);
1587                                 continue;
1588                         }
1589
1590                         /* Erase metadata on the disk being deleted and destroy it. */
1591                         promise_meta_erase(disk->d_consumer);
1592                         g_raid_destroy_disk(disk);
1593                 }
1594                 g_raid_md_promise_purge_volumes(sc);
1595
1596                 /* Write updated metadata to remaining disks. */
1597                 g_raid_md_write_promise(md, NULL, NULL, NULL);
1598
1599                 /* Check if anything is left. */
1600                 if (g_raid_ndisks(sc, -1) == 0)
1601                         g_raid_destroy_node(sc, 0);
1602                 else
1603                         g_raid_md_promise_refill(sc);
1604                 return (error);
1605         }
1606         if (strcmp(verb, "insert") == 0) {
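                     /*
                      * Typically requested as `graid insert <node> <disk>' from
                      * graid(8) (example invocation); each named disk is added
                      * as a spare and used to refill degraded volumes.
                      */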
1607                 if (*nargs < 2) {
1608                         gctl_error(req, "Invalid number of arguments.");
1609                         return (-1);
1610                 }
1611                 for (i = 1; i < *nargs; i++) {
1612                         /* Get disk name. */
1613                         snprintf(arg, sizeof(arg), "arg%d", i);
1614                         diskname = gctl_get_asciiparam(req, arg);
1615                         if (diskname == NULL) {
1616                                 gctl_error(req, "No disk name (%s).", arg);
1617                                 error = -3;
1618                                 break;
1619                         }
1620
1621                         /* Try to find provider with specified name. */
1622                         g_topology_lock();
1623                         cp = g_raid_open_consumer(sc, diskname);
1624                         if (cp == NULL) {
1625                                 gctl_error(req, "Can't open disk '%s'.",
1626                                     diskname);
1627                                 g_topology_unlock();
1628                                 error = -4;
1629                                 break;
1630                         }
1631                         pp = cp->provider;
1632                         g_topology_unlock();
1633
1634                         if (pp->mediasize / pp->sectorsize > UINT32_MAX) {
1635                                 gctl_error(req,
1636                                     "Disk '%s' is too big.", diskname);
1637                                 g_raid_kill_consumer(sc, cp);
1638                                 error = -8;
1639                                 break;
1640                         }
1641
1642                         pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
1643
1644                         disk = g_raid_create_disk(sc);
1645                         disk->d_consumer = cp;
1646                         disk->d_md_data = (void *)pd;
1647                         cp->private = disk;
1648
1649                         /* Read kernel dumping information. */
1650                         disk->d_kd.offset = 0;
1651                         disk->d_kd.length = OFF_MAX;
1652                         len = sizeof(disk->d_kd);
1653                         g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
1654                         if (disk->d_kd.di.dumper == NULL)
1655                                 G_RAID_DEBUG1(2, sc,
1656                                     "Dumping not supported by %s.",
1657                                     cp->provider->name);
1658
1659                         /* Welcome the "new" disk. */
1660                         g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
1661                         promise_meta_write_spare(cp);
1662                         g_raid_md_promise_refill(sc);
1663                 }
1664                 return (error);
1665         }
1666         return (-100);
1667 }
1668
1669 static int
1670 g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol,
1671     struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
1672 {
1673         struct g_raid_softc *sc;
1674         struct g_raid_volume *vol;
1675         struct g_raid_subdisk *sd;
1676         struct g_raid_disk *disk;
1677         struct g_raid_md_promise_perdisk *pd;
1678         struct g_raid_md_promise_pervolume *pv;
1679         struct promise_raid_conf *meta;
1680         off_t rebuild_lba64;
1681         int i, j, pos, rebuild;
1682
1683         sc = md->mdo_softc;
1684
1685         if (sc->sc_stopping == G_RAID_DESTROY_HARD)
1686                 return (0);
1687
1688         /* Generate new per-volume metadata for affected volumes. */
1689         TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1690                 if (vol->v_stopping)
1691                         continue;
1692
1693                 /* Skip volumes not related to specified targets. */
1694                 if (tvol != NULL && vol != tvol)
1695                         continue;
1696                 if (tsd != NULL && vol != tsd->sd_volume)
1697                         continue;
1698                 if (tdisk != NULL) {
1699                         for (i = 0; i < vol->v_disks_count; i++) {
1700                                 if (vol->v_subdisks[i].sd_disk == tdisk)
1701                                         break;
1702                         }
1703                         if (i >= vol->v_disks_count)
1704                                 continue;
1705                 }
1706
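                     /*
                      * The volume is affected: bump the generation and build a
                      * fresh metadata image, seeded from the previous one when
                      * it exists.
                      */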
1707                 pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
1708                 pv->pv_generation++;
1709
1710                 meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
1711                 if (pv->pv_meta != NULL)
1712                         memcpy(meta, pv->pv_meta, sizeof(*meta));
1713                 memcpy(meta->promise_id, PROMISE_MAGIC,
1714                     sizeof(PROMISE_MAGIC) - 1);
1715                 meta->dummy_0 = 0x00020000;
1716                 meta->integrity = PROMISE_I_VALID;
1717
1718                 meta->generation = pv->pv_generation;
1719                 meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE |
1720                     PROMISE_S_INITED | PROMISE_S_READY;
1721                 if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED)
1722                         meta->status |= PROMISE_S_DEGRADED;
1723                 if (vol->v_dirty)
1724                         meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */
1725                 if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 ||
1726                     vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE)
1727                         meta->type = PROMISE_T_RAID0;
1728                 else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1729                     vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
1730                         meta->type = PROMISE_T_RAID1;
1731                 else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3)
1732                         meta->type = PROMISE_T_RAID3;
1733                 else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
1734                         meta->type = PROMISE_T_RAID5;
1735                 else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT)
1736                         meta->type = PROMISE_T_SPAN;
1737                 else
1738                         meta->type = PROMISE_T_JBOD;
1739                 meta->total_disks = vol->v_disks_count;
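                     /*
                      * The strip size is kept as a power-of-two shift counting
                      * 512-byte sectors: the default 128KB strip gives
                      * ffs(128) == 8, i.e. 256 sectors per strip.
                      */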
1740                 meta->stripe_shift = ffs(vol->v_strip_size / 1024);
1741                 meta->array_width = vol->v_disks_count;
1742                 if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1743                     vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
1744                         meta->array_width /= 2;
1745                 meta->array_number = vol->v_global_id;
1746                 meta->total_sectors = vol->v_mediasize / vol->v_sectorsize;
1747                 meta->total_sectors_high =
1748                     (vol->v_mediasize / vol->v_sectorsize) >> 32;
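                     /*
                      * Synthesize a nominal CHS geometry; heads and cylinders
                      * appear to be stored minus one (254 == 255 - 1).
                      */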
1749                 meta->cylinders = meta->total_sectors / (255 * 63) - 1;
1750                 meta->heads = 254;
1751                 meta->sectors = 63;
1752                 meta->volume_id = pv->pv_id;
1753                 rebuild_lba64 = UINT64_MAX;
1754                 rebuild = 0;
1755                 for (i = 0; i < vol->v_disks_count; i++) {
1756                         sd = &vol->v_subdisks[i];
1757                         /* For RAID0+1 we need to translate order. */
1758                         pos = promise_meta_translate_disk(vol, i);
1759                         meta->disks[pos].flags = PROMISE_F_VALID |
1760                             PROMISE_F_ASSIGNED;
1761                         if (sd->sd_state == G_RAID_SUBDISK_S_NONE) {
1762                                 meta->disks[pos].flags |= 0;
1763                         } else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) {
1764                                 meta->disks[pos].flags |=
1765                                     PROMISE_F_DOWN | PROMISE_F_REDIR;
1766                         } else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) {
1767                                 meta->disks[pos].flags |=
1768                                     PROMISE_F_ONLINE | PROMISE_F_REDIR;
1769                                 if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
1770                                         rebuild_lba64 = MIN(rebuild_lba64,
1771                                             sd->sd_rebuild_pos / 512);
1772                                 } else
1773                                         rebuild_lba64 = 0;
1774                                 rebuild = 1;
1775                         } else {
1776                                 meta->disks[pos].flags |= PROMISE_F_ONLINE;
1777                                 if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) {
1778                                         meta->status |= PROMISE_S_MARKED;
1779                                         if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
1780                                                 rebuild_lba64 = MIN(rebuild_lba64,
1781                                                     sd->sd_rebuild_pos / 512);
1782                                         } else
1783                                                 rebuild_lba64 = 0;
1784                                 }
1785                         }
1786                         if (pv->pv_meta != NULL) {
1787                                 meta->disks[pos].id = pv->pv_meta->disks[pos].id;
1788                         } else {
1789                                 meta->disks[pos].number = i * 2;
1790                                 arc4rand(&meta->disks[pos].id,
1791                                     sizeof(meta->disks[pos].id), 0);
1792                         }
1793                 }
1794                 promise_meta_put_name(meta, vol->v_name);
1795
1796                 /* Try to mimic AMD BIOS rebuild/resync behavior. */
1797                 if (rebuild_lba64 != UINT64_MAX) {
1798                         if (rebuild)
1799                                 meta->magic_3 = 0x03040010UL; /* Rebuild? */
1800                         else
1801                                 meta->magic_3 = 0x03040008UL; /* Resync? */
1802                         /* Translate from per-disk to per-volume LBA. */
1803                         if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1804                             vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
1805                                 rebuild_lba64 *= meta->array_width;
1806                         } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
1807                             vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) {
1808                                 rebuild_lba64 *= meta->array_width - 1;
1809                         } else
1810                                 rebuild_lba64 = 0;
1811                 } else
1812                         meta->magic_3 = 0x03000000UL;
1813                 meta->rebuild_lba64 = rebuild_lba64;
1814                 meta->magic_4 = 0x04010101UL;
1815
1816                 /* Replace the per-volume metadata with the new copy. */
1817                 if (pv->pv_meta != NULL)
1818                         free(pv->pv_meta, M_MD_PROMISE);
1819                 pv->pv_meta = meta;
1820
1821                 /* Copy the new metadata to the disks, adding or replacing old copies. */
1822                 for (i = 0; i < vol->v_disks_count; i++) {
1823                         sd = &vol->v_subdisks[i];
1824                         disk = sd->sd_disk;
1825                         if (disk == NULL)
1826                                 continue;
1827                         /* For RAID0+1 we need to translate order. */
1828                         pos = promise_meta_translate_disk(vol, i);
1829                         pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
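                             /*
                              * Locate this volume's slot among the disk's
                              * per-subdisk metadata copies, appending a new one
                              * if it is not there yet.
                              */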
1830                         for (j = 0; j < pd->pd_subdisks; j++) {
1831                                 if (pd->pd_meta[j]->volume_id == meta->volume_id)
1832                                         break;
1833                         }
1834                         if (j == pd->pd_subdisks)
1835                                 pd->pd_subdisks++;
1836                         if (pd->pd_meta[j] != NULL)
1837                                 free(pd->pd_meta[j], M_MD_PROMISE);
1838                         pd->pd_meta[j] = promise_meta_copy(meta);
1839                         pd->pd_meta[j]->disk = meta->disks[pos];
1840                         pd->pd_meta[j]->disk.number = pos;
1841                         pd->pd_meta[j]->disk_offset = sd->sd_offset / 512;
1842                         pd->pd_meta[j]->disk_sectors = sd->sd_size / 512;
1843                         if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
1844                                 pd->pd_meta[j]->rebuild_lba =
1845                                     sd->sd_rebuild_pos / 512;
1846                         } else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD)
1847                                 pd->pd_meta[j]->rebuild_lba = 0;
1848                         else
1849                                 pd->pd_meta[j]->rebuild_lba = UINT32_MAX;
1850                         pd->pd_updated = 1;
1851                 }
1852         }
1853
1854         TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1855                 pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1856                 if (disk->d_state != G_RAID_DISK_S_ACTIVE)
1857                         continue;
1858                 if (!pd->pd_updated)
1859                         continue;
1860                 G_RAID_DEBUG(1, "Writing Promise metadata to %s",
1861                     g_raid_get_diskname(disk));
1862                 for (i = 0; i < pd->pd_subdisks; i++)
1863                         g_raid_md_promise_print(pd->pd_meta[i]);
1864                 promise_meta_write(disk->d_consumer,
1865                     pd->pd_meta, pd->pd_subdisks);
1866                 pd->pd_updated = 0;
1867         }
1868
1869         return (0);
1870 }
1871
1872 static int
1873 g_raid_md_fail_disk_promise(struct g_raid_md_object *md,
1874     struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
1875 {
1876         struct g_raid_softc *sc;
1877         struct g_raid_md_promise_perdisk *pd;
1878         struct g_raid_subdisk *sd;
1879         int i, pos;
1880
1881         sc = md->mdo_softc;
1882         pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data;
1883
1884         /* We can't fail a disk that is not currently part of the array. */
1885         if (tdisk->d_state != G_RAID_DISK_S_ACTIVE)
1886                 return (-1);
1887
1888         /*
1889          * Mark the disk as failed in the metadata and try to write that
1890          * metadata to the disk itself to prevent its later resurrection as STALE.
1891          */
1892         if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL)
1893                 G_RAID_DEBUG(1, "Writing Promise metadata to %s",
1894                     g_raid_get_diskname(tdisk));
1895         for (i = 0; i < pd->pd_subdisks; i++) {
1896                 pd->pd_meta[i]->disk.flags |=
1897                     PROMISE_F_DOWN | PROMISE_F_REDIR;
1898                 pos = pd->pd_meta[i]->disk.number;
1899                 if (pos >= 0 && pos < PROMISE_MAX_DISKS) {
1900                         pd->pd_meta[i]->disks[pos].flags |=
1901                             PROMISE_F_DOWN | PROMISE_F_REDIR;
1902                 }
1903                 g_raid_md_promise_print(pd->pd_meta[i]);
1904         }
1905         if (tdisk->d_consumer != NULL)
1906                 promise_meta_write(tdisk->d_consumer,
1907                     pd->pd_meta, pd->pd_subdisks);
1908
1909         /* Change states. */
1910         g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
1911         TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
1912                 g_raid_change_subdisk_state(sd,
1913                     G_RAID_SUBDISK_S_FAILED);
1914                 g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
1915                     G_RAID_EVENT_SUBDISK);
1916         }
1917
1918         /* Write updated metadata to remaining disks. */
1919         g_raid_md_write_promise(md, NULL, NULL, tdisk);
1920
1921         g_raid_md_promise_refill(sc);
1922         return (0);
1923 }
1924
1925 static int
1926 g_raid_md_free_disk_promise(struct g_raid_md_object *md,
1927     struct g_raid_disk *disk)
1928 {
1929         struct g_raid_md_promise_perdisk *pd;
1930         int i;
1931
1932         pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1933         for (i = 0; i < pd->pd_subdisks; i++) {
1934                 if (pd->pd_meta[i] != NULL) {
1935                         free(pd->pd_meta[i], M_MD_PROMISE);
1936                         pd->pd_meta[i] = NULL;
1937                 }
1938         }
1939         free(pd, M_MD_PROMISE);
1940         disk->d_md_data = NULL;
1941         return (0);
1942 }
1943
1944 static int
1945 g_raid_md_free_volume_promise(struct g_raid_md_object *md,
1946     struct g_raid_volume *vol)
1947 {
1948         struct g_raid_md_promise_pervolume *pv;
1949
1950         pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
1951         if (pv && pv->pv_meta != NULL) {
1952                 free(pv->pv_meta, M_MD_PROMISE);
1953                 pv->pv_meta = NULL;
1954         }
1955         if (pv && !pv->pv_started) {
1956                 pv->pv_started = 1;
1957                 callout_stop(&pv->pv_start_co);
1958         }
1959         free(pv, M_MD_PROMISE);
1960         vol->v_md_data = NULL;
1961         return (0);
1962 }
1963
1964 static int
1965 g_raid_md_free_promise(struct g_raid_md_object *md)
1966 {
1967
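         /*
          * Nothing to do here: all Promise state is per-disk and per-volume
          * and is released by the handlers above.
          */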
1968         return (0);
1969 }
1970
1971 G_RAID_MD_DECLARE(promise, "Promise");