/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 Alexander Motin <mav@FreeBSD.org>
 * Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include "geom/raid/g_raid.h"
#include "g_raid_md_if.h"

static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata");

#define PROMISE_MAX_DISKS       8
#define PROMISE_MAX_SUBDISKS    2
#define PROMISE_META_OFFSET     14
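
/*
 * On-disk layout: subdisk record N occupies four sectors starting
 * (63 - N * PROMISE_META_OFFSET) sectors before the end of the disk,
 * i.e. record 0 at -63 sectors and record 1 at -49 sectors (see
 * promise_meta_read() and promise_meta_write() below).
 */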

struct promise_raid_disk {
        uint8_t         flags;                  /* Subdisk status. */
#define PROMISE_F_VALID         0x01
#define PROMISE_F_ONLINE        0x02
#define PROMISE_F_ASSIGNED      0x04
#define PROMISE_F_SPARE         0x08
#define PROMISE_F_DUPLICATE     0x10
#define PROMISE_F_REDIR         0x20
#define PROMISE_F_DOWN          0x40
#define PROMISE_F_READY         0x80

        uint8_t         number;                 /* Position in a volume. */
        uint8_t         channel;                /* ATA channel number. */
        uint8_t         device;                 /* ATA device number. */
        uint64_t        id __packed;            /* Subdisk ID. */
} __packed;

struct promise_raid_conf {
        char            promise_id[24];
#define PROMISE_MAGIC           "Promise Technology, Inc."
#define FREEBSD_MAGIC           "FreeBSD ATA driver RAID "

        uint32_t        dummy_0;
        uint64_t        magic_0;
#define PROMISE_MAGIC0(x)       (((uint64_t)(x.channel) << 48) | \
                                ((uint64_t)(x.device != 0) << 56))
        uint16_t        magic_1;
        uint32_t        magic_2;
        uint8_t         filler1[470];

        uint32_t        integrity;
#define PROMISE_I_VALID         0x00000080

        struct promise_raid_disk        disk;   /* This subdisk info. */
        uint32_t        disk_offset;            /* Subdisk offset. */
        uint32_t        disk_sectors;           /* Subdisk size. */
        uint32_t        disk_rebuild;           /* Rebuild position. */
        uint16_t        generation;             /* Generation number. */
        uint8_t         status;                 /* Volume status. */
#define PROMISE_S_VALID         0x01
#define PROMISE_S_ONLINE        0x02
#define PROMISE_S_INITED        0x04
#define PROMISE_S_READY         0x08
#define PROMISE_S_DEGRADED      0x10
#define PROMISE_S_MARKED        0x20
#define PROMISE_S_MIGRATING     0x40
#define PROMISE_S_FUNCTIONAL    0x80

        uint8_t         type;                   /* Volume type. */
#define PROMISE_T_RAID0         0x00
#define PROMISE_T_RAID1         0x01
#define PROMISE_T_RAID3         0x02
#define PROMISE_T_RAID5         0x04
#define PROMISE_T_SPAN          0x08
#define PROMISE_T_JBOD          0x10

        uint8_t         total_disks;            /* Disks in this volume. */
        uint8_t         stripe_shift;           /* Strip size. */
        uint8_t         array_width;            /* Number of RAID0 stripes. */
        uint8_t         array_number;           /* Global volume number. */
        uint32_t        total_sectors;          /* Volume size. */
        uint16_t        cylinders;              /* Volume geometry: C. */
        uint8_t         heads;                  /* Volume geometry: H. */
        uint8_t         sectors;                /* Volume geometry: S. */
        uint64_t        volume_id __packed;     /* Volume ID. */
        struct promise_raid_disk        disks[PROMISE_MAX_DISKS];
                                                /* Subdisks in this volume. */
        char            name[32];               /* Volume label. */

        uint32_t        filler2[8];
        uint32_t        magic_3;        /* Something related to rebuild. */
        uint64_t        rebuild_lba64;  /* Per-volume rebuild position. */
        uint32_t        magic_4;
        uint32_t        magic_5;
        uint32_t        total_sectors_high;
        uint8_t         magic_6;
        uint8_t         sector_size;
        uint16_t        magic_7;
        uint32_t        magic_8[31];
        uint32_t        backup_time;
        uint16_t        magic_9;
        uint32_t        disk_offset_high;
        uint32_t        disk_sectors_high;
        uint32_t        disk_rebuild_high;
        uint16_t        magic_10;
        uint32_t        magic_11[3];
        uint32_t        filler3[284];
        uint32_t        checksum;
} __packed;
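
/*
 * The structure above is exactly 2048 bytes (four 512-byte sectors);
 * checksum is its last 32-bit word and holds the sum of the preceding
 * 511 32-bit words (see promise_meta_read()/promise_meta_write()).
 */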

struct g_raid_md_promise_perdisk {
        int              pd_updated;
        int              pd_subdisks;
        struct promise_raid_conf        *pd_meta[PROMISE_MAX_SUBDISKS];
};

struct g_raid_md_promise_pervolume {
        struct promise_raid_conf        *pv_meta;
        uint64_t                         pv_id;
        uint16_t                         pv_generation;
        int                              pv_disks_present;
        int                              pv_started;
        struct callout                   pv_start_co;   /* STARTING state timer. */
};

static g_raid_md_create_t g_raid_md_create_promise;
static g_raid_md_taste_t g_raid_md_taste_promise;
static g_raid_md_event_t g_raid_md_event_promise;
static g_raid_md_volume_event_t g_raid_md_volume_event_promise;
static g_raid_md_ctl_t g_raid_md_ctl_promise;
static g_raid_md_write_t g_raid_md_write_promise;
static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise;
static g_raid_md_free_disk_t g_raid_md_free_disk_promise;
static g_raid_md_free_volume_t g_raid_md_free_volume_promise;
static g_raid_md_free_t g_raid_md_free_promise;

static kobj_method_t g_raid_md_promise_methods[] = {
        KOBJMETHOD(g_raid_md_create,    g_raid_md_create_promise),
        KOBJMETHOD(g_raid_md_taste,     g_raid_md_taste_promise),
        KOBJMETHOD(g_raid_md_event,     g_raid_md_event_promise),
        KOBJMETHOD(g_raid_md_volume_event,      g_raid_md_volume_event_promise),
        KOBJMETHOD(g_raid_md_ctl,       g_raid_md_ctl_promise),
        KOBJMETHOD(g_raid_md_write,     g_raid_md_write_promise),
        KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_promise),
        KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_promise),
        KOBJMETHOD(g_raid_md_free_volume,       g_raid_md_free_volume_promise),
        KOBJMETHOD(g_raid_md_free,      g_raid_md_free_promise),
        { 0, 0 }
};

static struct g_raid_md_class g_raid_md_promise_class = {
        "Promise",
        g_raid_md_promise_methods,
        sizeof(struct g_raid_md_object),
        .mdc_enable = 1,
        .mdc_priority = 100
};

static void
g_raid_md_promise_print(struct promise_raid_conf *meta)
{
        int i;

        if (g_raid_debug < 1)
                return;

        printf("********* ATA Promise Metadata *********\n");
        printf("promise_id          <%.24s>\n", meta->promise_id);
        printf("disk                %02x %02x %02x %02x %016jx\n",
            meta->disk.flags, meta->disk.number, meta->disk.channel,
            meta->disk.device, meta->disk.id);
        printf("disk_offset         %u\n", meta->disk_offset);
        printf("disk_sectors        %u\n", meta->disk_sectors);
        printf("disk_rebuild        %u\n", meta->disk_rebuild);
        printf("generation          %u\n", meta->generation);
        printf("status              0x%02x\n", meta->status);
        printf("type                %u\n", meta->type);
        printf("total_disks         %u\n", meta->total_disks);
        printf("stripe_shift        %u\n", meta->stripe_shift);
        printf("array_width         %u\n", meta->array_width);
        printf("array_number        %u\n", meta->array_number);
        printf("total_sectors       %u\n", meta->total_sectors);
        printf("cylinders           %u\n", meta->cylinders);
        printf("heads               %u\n", meta->heads);
        printf("sectors             %u\n", meta->sectors);
        printf("volume_id           0x%016jx\n", meta->volume_id);
        printf("disks:\n");
        for (i = 0; i < PROMISE_MAX_DISKS; i++) {
                printf("                    %02x %02x %02x %02x %016jx\n",
                    meta->disks[i].flags, meta->disks[i].number,
                    meta->disks[i].channel, meta->disks[i].device,
                    meta->disks[i].id);
        }
        printf("name                <%.32s>\n", meta->name);
        printf("magic_3             0x%08x\n", meta->magic_3);
        printf("rebuild_lba64       %ju\n", meta->rebuild_lba64);
        printf("magic_4             0x%08x\n", meta->magic_4);
        printf("magic_5             0x%08x\n", meta->magic_5);
        printf("total_sectors_high  0x%08x\n", meta->total_sectors_high);
        printf("sector_size         %u\n", meta->sector_size);
        printf("backup_time         %d\n", meta->backup_time);
        printf("disk_offset_high    0x%08x\n", meta->disk_offset_high);
        printf("disk_sectors_high   0x%08x\n", meta->disk_sectors_high);
        printf("disk_rebuild_high   0x%08x\n", meta->disk_rebuild_high);
        printf("=================================================\n");
}

static struct promise_raid_conf *
promise_meta_copy(struct promise_raid_conf *meta)
{
        struct promise_raid_conf *nmeta;

        nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK);
        memcpy(nmeta, meta, sizeof(*nmeta));
        return (nmeta);
}

static int
promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id)
{
        int pos;

        for (pos = 0; pos < meta->total_disks; pos++) {
                if (meta->disks[pos].id == id)
                        return (pos);
        }
        return (-1);
}

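/*
 * Find the largest range of sectors on the disk that is not covered by
 * any of the nsd subdisk extents, keeping the last 131072 sectors
 * (64MB at 512 bytes per sector) reserved for metadata.  Returns
 * nonzero if a non-empty range was stored in *off/*size.
 */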
static int
promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd,
    off_t sectors, off_t *off, off_t *size)
{
        off_t coff, csize, tmp;
        int i, j;

        sectors -= 131072;
        *off = 0;
        *size = 0;
        coff = 0;
        csize = sectors;
        i = 0;
        while (1) {
                for (j = 0; j < nsd; j++) {
                        tmp = ((off_t)metaarr[j]->disk_offset_high << 32) +
                            metaarr[j]->disk_offset;
                        if (tmp >= coff)
                                csize = MIN(csize, tmp - coff);
                }
                if (csize > *size) {
                        *off = coff;
                        *size = csize;
                }
                if (i >= nsd)
                        break;
                coff = ((off_t)metaarr[i]->disk_offset_high << 32) +
                     metaarr[i]->disk_offset +
                    ((off_t)metaarr[i]->disk_sectors_high << 32) +
                     metaarr[i]->disk_sectors;
                csize = sectors - coff;
                i++;
        }
        return ((*size > 0) ? 1 : 0);
}

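/*
 * Translate a subdisk position from Promise metadata to g_raid terms.
 * For RAID1E (RAID0+1) Promise lists all members of one mirror copy
 * before the other, while g_raid keeps copies of the same stripe
 * adjacent, so the 2 x width position matrix is transposed here.
 */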
static int
promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos)
{
        int disk_pos, width;

        if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
                width = vol->v_disks_count / 2;
                disk_pos = (md_disk_pos / width) +
                    (md_disk_pos % width) * 2;
        } else
                disk_pos = md_disk_pos;
        return (disk_pos);
}

static void
promise_meta_get_name(struct promise_raid_conf *meta, char *buf)
{
        int i;

        strncpy(buf, meta->name, 32);
        buf[32] = 0;
        for (i = 31; i >= 0; i--) {
                if (buf[i] > 0x20)
                        break;
                buf[i] = 0;
        }
}

static void
promise_meta_put_name(struct promise_raid_conf *meta, char *buf)
{

        memset(meta->name, 0x20, 32);
        memcpy(meta->name, buf, MIN(strlen(buf), 32));
}

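/*
 * Read up to PROMISE_MAX_SUBDISKS metadata records from the disk into
 * metaarr[], checking the magic string, checksum, validity flag and
 * disk count of each; returns the number of valid records found.
 */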
static int
promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr)
{
        struct g_provider *pp;
        struct promise_raid_conf *meta;
        char *buf;
        int error, i, subdisks;
        uint32_t checksum, *ptr;

        pp = cp->provider;
        subdisks = 0;

        if (pp->sectorsize * 4 > MAXPHYS) {
                G_RAID_DEBUG(1, "%s: Blocksize is too big.", pp->name);
                return (subdisks);
        }
next:
        /* Read metadata block. */
        buf = g_read_data(cp, pp->mediasize - pp->sectorsize *
            (63 - subdisks * PROMISE_META_OFFSET),
            pp->sectorsize * 4, &error);
        if (buf == NULL) {
                G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
                    pp->name, error);
                return (subdisks);
        }
        meta = (struct promise_raid_conf *)buf;

        /* Check if this is a Promise RAID struct. */
        if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) &&
            strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) {
                if (subdisks == 0)
                        G_RAID_DEBUG(1,
                            "Promise signature check failed on %s", pp->name);
                g_free(buf);
                return (subdisks);
        }
        meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK);
        memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4));
        g_free(buf);

        /* Check metadata checksum. */
        for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
                checksum += *ptr++;
        if (checksum != meta->checksum) {
                G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name);
                free(meta, M_MD_PROMISE);
                return (subdisks);
        }

        if ((meta->integrity & PROMISE_I_VALID) == 0) {
                G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name);
                free(meta, M_MD_PROMISE);
                return (subdisks);
        }

        if (meta->total_disks > PROMISE_MAX_DISKS) {
                G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)",
                    pp->name, meta->total_disks);
                free(meta, M_MD_PROMISE);
                return (subdisks);
        }

        /* Remove filler garbage from fields used in newer metadata. */
        if (meta->disk_offset_high == 0x8b8c8d8e &&
            meta->disk_sectors_high == 0x8788898a &&
            meta->disk_rebuild_high == 0x83848586) {
                meta->disk_offset_high = 0;
                meta->disk_sectors_high = 0;
                if (meta->disk_rebuild == UINT32_MAX)
                        meta->disk_rebuild_high = UINT32_MAX;
                else
                        meta->disk_rebuild_high = 0;
                if (meta->total_sectors_high == 0x15161718) {
                        meta->total_sectors_high = 0;
                        meta->backup_time = 0;
                        if (meta->rebuild_lba64 == 0x2122232425262728)
                                meta->rebuild_lba64 = UINT64_MAX;
                }
        }
        if (meta->sector_size < 1 || meta->sector_size > 8)
                meta->sector_size = 1;

        /* Save this part and look for next. */
        *metaarr = meta;
        metaarr++;
        subdisks++;
        if (subdisks < PROMISE_MAX_SUBDISKS)
                goto next;

        return (subdisks);
}

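/*
 * Write all PROMISE_MAX_SUBDISKS metadata slots back to the disk: the
 * nsd real records first, then one fake record describing the largest
 * unused range, and zeroed sectors for any remaining slots.
 */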
static int
promise_meta_write(struct g_consumer *cp,
    struct promise_raid_conf **metaarr, int nsd)
{
        struct g_provider *pp;
        struct promise_raid_conf *meta;
        char *buf;
        off_t off, size;
        int error, i, subdisk, fake;
        uint32_t checksum, *ptr;

        pp = cp->provider;
        subdisk = 0;
        fake = 0;
next:
        buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO);
        meta = NULL;
        if (subdisk < nsd) {
                meta = metaarr[subdisk];
        } else if (!fake && promise_meta_unused_range(metaarr, nsd,
            cp->provider->mediasize / cp->provider->sectorsize,
            &off, &size)) {
                /* Optionally add record for unused space. */
                meta = (struct promise_raid_conf *)buf;
                memcpy(&meta->promise_id[0], PROMISE_MAGIC,
                    sizeof(PROMISE_MAGIC) - 1);
                meta->dummy_0 = 0x00020000;
                meta->integrity = PROMISE_I_VALID;
                meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID;
                meta->disk.number = 0xff;
                arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
                meta->disk_offset_high = off >> 32;
                meta->disk_offset = (uint32_t)off;
                meta->disk_sectors_high = size >> 32;
                meta->disk_sectors = (uint32_t)size;
                meta->disk_rebuild_high = UINT32_MAX;
                meta->disk_rebuild = UINT32_MAX;
                fake = 1;
        }
        if (meta != NULL) {
                /* Recalculate the checksum in case the metadata was changed. */
                meta->checksum = 0;
                for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
                        checksum += *ptr++;
                meta->checksum = checksum;
                memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta)));
        }
        error = g_write_data(cp, pp->mediasize - pp->sectorsize *
            (63 - subdisk * PROMISE_META_OFFSET),
            buf, pp->sectorsize * 4);
        if (error != 0) {
                G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
                    pp->name, error);
        }
        free(buf, M_MD_PROMISE);

        subdisk++;
        if (subdisk < PROMISE_MAX_SUBDISKS)
                goto next;

        return (error);
}

static int
promise_meta_erase(struct g_consumer *cp)
{
        struct g_provider *pp;
        char *buf;
        int error, subdisk;

        pp = cp->provider;
        buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO);
        for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) {
                error = g_write_data(cp, pp->mediasize - pp->sectorsize *
                    (63 - subdisk * PROMISE_META_OFFSET),
                    buf, 4 * pp->sectorsize);
                if (error != 0) {
                        G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
                            pp->name, error);
                }
        }
        free(buf, M_MD_PROMISE);
        return (error);
}

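/*
 * Mark the disk as a global spare: write a single fake record flagged
 * PROMISE_F_SPARE that spans the whole disk minus the reserved tail.
 */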
static int
promise_meta_write_spare(struct g_consumer *cp)
{
        struct promise_raid_conf *meta;
        off_t tmp;
        int error;

        meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
        memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1);
        meta->dummy_0 = 0x00020000;
        meta->integrity = PROMISE_I_VALID;
        meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID;
        meta->disk.number = 0xff;
        arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
        tmp = cp->provider->mediasize / cp->provider->sectorsize - 131072;
        meta->disk_sectors_high = tmp >> 32;
        meta->disk_sectors = (uint32_t)tmp;
        meta->disk_rebuild_high = UINT32_MAX;
        meta->disk_rebuild = UINT32_MAX;
        error = promise_meta_write(cp, &meta, 1);
        free(meta, M_MD_PROMISE);
        return (error);
}

static struct g_raid_volume *
g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id)
{
        struct g_raid_volume    *vol;
        struct g_raid_md_promise_pervolume *pv;

        TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
                pv = vol->v_md_data;
                if (pv->pv_id == id)
                        break;
        }
        return (vol);
}

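/*
 * Destroy any started volume that no longer has a single subdisk
 * present; returns nonzero if something was destroyed.
 */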
static int
g_raid_md_promise_purge_volumes(struct g_raid_softc *sc)
{
        struct g_raid_volume    *vol, *tvol;
        struct g_raid_md_promise_pervolume *pv;
        int i, res;

        res = 0;
        TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) {
                pv = vol->v_md_data;
                if (!pv->pv_started || vol->v_stopping)
                        continue;
                for (i = 0; i < vol->v_disks_count; i++) {
                        if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE)
                                break;
                }
                if (i >= vol->v_disks_count) {
                        g_raid_destroy_volume(vol);
                        res = 1;
                }
        }
        return (res);
}

static int
g_raid_md_promise_purge_disks(struct g_raid_softc *sc)
{
        struct g_raid_disk      *disk, *tdisk;
        struct g_raid_volume    *vol;
        struct g_raid_md_promise_perdisk *pd;
        int i, j, res;

        res = 0;
        TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
                if (disk->d_state == G_RAID_DISK_S_SPARE)
                        continue;
                pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;

                /* Scan for deleted volumes. */
                for (i = 0; i < pd->pd_subdisks; ) {
                        vol = g_raid_md_promise_get_volume(sc,
                            pd->pd_meta[i]->volume_id);
                        if (vol != NULL && !vol->v_stopping) {
                                i++;
                                continue;
                        }
                        free(pd->pd_meta[i], M_MD_PROMISE);
                        for (j = i; j < pd->pd_subdisks - 1; j++)
                                pd->pd_meta[j] = pd->pd_meta[j + 1];
                        pd->pd_meta[pd->pd_subdisks - 1] = NULL;
                        pd->pd_subdisks--;
                        pd->pd_updated = 1;
                }

                /* If there is no metadata left - erase and delete disk. */
                if (pd->pd_subdisks == 0) {
                        promise_meta_erase(disk->d_consumer);
                        g_raid_destroy_disk(disk);
                        res = 1;
                }
        }
        return (res);
}

static int
g_raid_md_promise_supported(int level, int qual, int disks, int force)
{

        if (disks > PROMISE_MAX_DISKS)
                return (0);
        switch (level) {
        case G_RAID_VOLUME_RL_RAID0:
                if (disks < 1)
                        return (0);
                if (!force && disks < 2)
                        return (0);
                break;
        case G_RAID_VOLUME_RL_RAID1:
                if (disks < 1)
                        return (0);
                if (!force && (disks != 2))
                        return (0);
                break;
        case G_RAID_VOLUME_RL_RAID1E:
                if (disks < 2)
                        return (0);
                if (disks % 2 != 0)
                        return (0);
                if (!force && (disks != 4))
                        return (0);
                break;
        case G_RAID_VOLUME_RL_SINGLE:
                if (disks != 1)
                        return (0);
                break;
        case G_RAID_VOLUME_RL_CONCAT:
                if (disks < 2)
                        return (0);
                break;
        case G_RAID_VOLUME_RL_RAID5:
                if (disks < 3)
                        return (0);
                if (qual != G_RAID_VOLUME_RLQ_R5LA)
                        return (0);
                break;
        default:
                return (0);
        }
        if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE)
                return (0);
        return (1);
}

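/*
 * Attach a disk to a volume.  sdn >= 0 names the disk's own metadata
 * record for this volume; sdn < 0 means we are trying to place a spare
 * disk into a free slot.  Returns nonzero if the disk was resurrected
 * into a previously missing or failed position.
 */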
static int
g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn,
    struct g_raid_volume *vol)
{
        struct g_raid_softc *sc;
        struct g_raid_subdisk *sd;
        struct g_raid_md_promise_perdisk *pd;
        struct g_raid_md_promise_pervolume *pv;
        struct promise_raid_conf *meta;
        off_t eoff, esize, size;
        int disk_pos, md_disk_pos, i, resurrection = 0;

        sc = disk->d_softc;
        pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;

        pv = vol->v_md_data;
        meta = pv->pv_meta;

        if (sdn >= 0) {
                /* Find disk position in metadata by its serial. */
                md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id);
                /* For RAID0+1 we need to translate order. */
                disk_pos = promise_meta_translate_disk(vol, md_disk_pos);
        } else {
                md_disk_pos = -1;
                disk_pos = -1;
        }
        if (disk_pos < 0) {
                G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s",
                    g_raid_get_diskname(disk), vol->v_name);
                /* Failed stale disk is useless for us. */
                if (sdn >= 0 &&
                    pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) {
                        g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
                        return (0);
                }
                /* If we were given specific metadata subdisk - erase it. */
                if (sdn >= 0) {
                        free(pd->pd_meta[sdn], M_MD_PROMISE);
                        for (i = sdn; i < pd->pd_subdisks - 1; i++)
                                pd->pd_meta[i] = pd->pd_meta[i + 1];
                        pd->pd_meta[pd->pd_subdisks - 1] = NULL;
                        pd->pd_subdisks--;
                }
                /* If we are in the start process, that's all for now. */
                if (!pv->pv_started)
                        goto nofit;
                /*
                 * If we have already started, try to make use of the disk.
                 * Try to replace OFFLINE disks first, then FAILED.
                 */
                promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks,
                    disk->d_consumer->provider->mediasize /
                    disk->d_consumer->provider->sectorsize,
                    &eoff, &esize);
                if (esize == 0) {
                        G_RAID_DEBUG1(1, sc, "No free space on disk %s",
                            g_raid_get_diskname(disk));
                        goto nofit;
                }
                size = INT64_MAX;
                for (i = 0; i < vol->v_disks_count; i++) {
                        sd = &vol->v_subdisks[i];
                        if (sd->sd_state != G_RAID_SUBDISK_S_NONE)
                                size = sd->sd_size;
                        if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED &&
                            (disk_pos < 0 ||
                             vol->v_subdisks[i].sd_state < sd->sd_state))
                                disk_pos = i;
                }
                if (disk_pos >= 0 &&
                    vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT &&
                    (off_t)esize * 512 < size) {
                        G_RAID_DEBUG1(1, sc, "Disk %s free space "
                            "is too small (%ju < %ju)",
                            g_raid_get_diskname(disk),
                            (off_t)esize * 512, size);
                        disk_pos = -1;
                }
                if (disk_pos >= 0) {
                        if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT)
                                esize = size / 512;
                        /* For RAID0+1 we need to translate order. */
                        md_disk_pos = promise_meta_translate_disk(vol, disk_pos);
                } else {
nofit:
                        if (pd->pd_subdisks == 0) {
                                g_raid_change_disk_state(disk,
                                    G_RAID_DISK_S_SPARE);
                        }
                        return (0);
                }
                G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s",
                    g_raid_get_diskname(disk), disk_pos, vol->v_name);
                resurrection = 1;
        }

        sd = &vol->v_subdisks[disk_pos];

        if (resurrection && sd->sd_disk != NULL) {
                g_raid_change_disk_state(sd->sd_disk,
                    G_RAID_DISK_S_STALE_FAILED);
                TAILQ_REMOVE(&sd->sd_disk->d_subdisks,
                    sd, sd_next);
        }
        vol->v_subdisks[disk_pos].sd_disk = disk;
        TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);

        /* Welcome the new disk. */
        if (resurrection)
                g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
        else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN)
                g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
        else
                g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);

        if (resurrection) {
                sd->sd_offset = (off_t)eoff * 512;
                sd->sd_size = (off_t)esize * 512;
        } else {
                sd->sd_offset = (((off_t)pd->pd_meta[sdn]->disk_offset_high
                    << 32) + pd->pd_meta[sdn]->disk_offset) * 512;
                sd->sd_size = (((off_t)pd->pd_meta[sdn]->disk_sectors_high
                    << 32) + pd->pd_meta[sdn]->disk_sectors) * 512;
        }

        if (resurrection) {
                /* Stale disk, almost same as new. */
                g_raid_change_subdisk_state(sd,
                    G_RAID_SUBDISK_S_NEW);
        } else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) {
                /* Failed disk. */
                g_raid_change_subdisk_state(sd,
                    G_RAID_SUBDISK_S_FAILED);
        } else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) {
                /* Rebuilding disk. */
                g_raid_change_subdisk_state(sd,
                    G_RAID_SUBDISK_S_REBUILD);
                if (pd->pd_meta[sdn]->generation != meta->generation)
                        sd->sd_rebuild_pos = 0;
                else {
                        sd->sd_rebuild_pos =
                            (((off_t)pd->pd_meta[sdn]->disk_rebuild_high << 32) +
                             pd->pd_meta[sdn]->disk_rebuild) * 512;
                }
        } else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) {
                /* Rebuilding disk. */
                g_raid_change_subdisk_state(sd,
                    G_RAID_SUBDISK_S_NEW);
        } else if (pd->pd_meta[sdn]->generation != meta->generation ||
            (meta->status & PROMISE_S_MARKED)) {
                /* Stale disk or dirty volume (unclean shutdown). */
                g_raid_change_subdisk_state(sd,
                    G_RAID_SUBDISK_S_STALE);
        } else {
                /* Up to date disk. */
                g_raid_change_subdisk_state(sd,
                    G_RAID_SUBDISK_S_ACTIVE);
        }
        g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
            G_RAID_EVENT_SUBDISK);

        return (resurrection);
}

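/*
 * Scan all started volumes for missing or failed subdisks and try to
 * refill them with any disk that has free metadata slots and space;
 * restart the scan after each successful placement, since writing the
 * updated metadata may change disk states.
 */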
static void
g_raid_md_promise_refill(struct g_raid_softc *sc)
{
        struct g_raid_volume *vol;
        struct g_raid_subdisk *sd;
        struct g_raid_disk *disk;
        struct g_raid_md_object *md;
        struct g_raid_md_promise_perdisk *pd;
        struct g_raid_md_promise_pervolume *pv;
        int update, updated, i, bad;

        md = sc->sc_md;
restart:
        updated = 0;
        TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
                pv = vol->v_md_data;
                if (!pv->pv_started || vol->v_stopping)
                        continue;

                /* Search for subdisk that needs replacement. */
                bad = 0;
                for (i = 0; i < vol->v_disks_count; i++) {
                        sd = &vol->v_subdisks[i];
                        if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
                            sd->sd_state == G_RAID_SUBDISK_S_FAILED)
                                bad = 1;
                }
                if (!bad)
                        continue;

                G_RAID_DEBUG1(1, sc, "Volume %s is not complete, "
                    "trying to refill.", vol->v_name);

                TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
                        /* Skip failed. */
                        if (disk->d_state < G_RAID_DISK_S_SPARE)
                                continue;
                        /* Skip already used by this volume. */
                        for (i = 0; i < vol->v_disks_count; i++) {
                                sd = &vol->v_subdisks[i];
                                if (sd->sd_disk == disk)
                                        break;
                        }
                        if (i < vol->v_disks_count)
                                continue;

                        /* Try to use disk if it has empty extents. */
                        pd = disk->d_md_data;
                        if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) {
                                update =
                                    g_raid_md_promise_start_disk(disk, -1, vol);
                        } else
                                update = 0;
                        if (update) {
                                updated = 1;
                                g_raid_md_write_promise(md, vol, NULL, disk);
                                break;
                        }
                }
        }
        if (updated)
                goto restart;
}

static void
g_raid_md_promise_start(struct g_raid_volume *vol)
{
        struct g_raid_softc *sc;
        struct g_raid_subdisk *sd;
        struct g_raid_disk *disk;
        struct g_raid_md_object *md;
        struct g_raid_md_promise_perdisk *pd;
        struct g_raid_md_promise_pervolume *pv;
        struct promise_raid_conf *meta;
        u_int i;

        sc = vol->v_softc;
        md = sc->sc_md;
        pv = vol->v_md_data;
        meta = pv->pv_meta;

        vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
        if (meta->type == PROMISE_T_RAID0)
                vol->v_raid_level = G_RAID_VOLUME_RL_RAID0;
        else if (meta->type == PROMISE_T_RAID1) {
                if (meta->array_width == 1)
                        vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
                else
                        vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
        } else if (meta->type == PROMISE_T_RAID3)
                vol->v_raid_level = G_RAID_VOLUME_RL_RAID3;
        else if (meta->type == PROMISE_T_RAID5) {
                vol->v_raid_level = G_RAID_VOLUME_RL_RAID5;
                vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA;
        } else if (meta->type == PROMISE_T_SPAN)
                vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT;
        else if (meta->type == PROMISE_T_JBOD)
                vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE;
        else
                vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
        vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ
        vol->v_disks_count = meta->total_disks;
        vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ
        if (meta->total_sectors_high < 256) /* If value looks sane. */
                vol->v_mediasize +=
                    ((off_t)meta->total_sectors_high << 32) * 512; //ZZZ
        vol->v_sectorsize = 512 * meta->sector_size;
        for (i = 0; i < vol->v_disks_count; i++) {
                sd = &vol->v_subdisks[i];
                sd->sd_offset = (((off_t)meta->disk_offset_high << 32) +
                    meta->disk_offset) * 512;
                sd->sd_size = (((off_t)meta->disk_sectors_high << 32) +
                    meta->disk_sectors) * 512;
        }
        g_raid_start_volume(vol);

        /* Make all disks found so far take their places. */
        TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
                pd = disk->d_md_data;
                for (i = 0; i < pd->pd_subdisks; i++) {
                        if (pd->pd_meta[i]->volume_id == meta->volume_id)
                                g_raid_md_promise_start_disk(disk, i, vol);
                }
        }

        pv->pv_started = 1;
        callout_stop(&pv->pv_start_co);
        G_RAID_DEBUG1(0, sc, "Volume started.");
        g_raid_md_write_promise(md, vol, NULL, NULL);

        /* Pick up any STALE/SPARE disks to refill the array if needed. */
        g_raid_md_promise_refill(sc);

        g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME);
}

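/*
 * STARTING-state timer callback: if not all expected disks have shown
 * up within g_raid_start_timeout seconds, force the volume to start
 * with whatever disks are present.
 */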
static void
g_raid_promise_go(void *arg)
{
        struct g_raid_volume *vol;
        struct g_raid_softc *sc;
        struct g_raid_md_promise_pervolume *pv;

        vol = arg;
        pv = vol->v_md_data;
        sc = vol->v_softc;
        if (!pv->pv_started) {
                G_RAID_DEBUG1(0, sc, "Force volume start due to timeout.");
                g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD,
                    G_RAID_EVENT_VOLUME);
        }
}

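/*
 * Process a newly arrived disk: create or look up a volume for each of
 * its metadata records, track metadata freshness per volume, and either
 * attach the disk to already-started volumes or start a volume once all
 * of its expected disks are present.
 */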
static void
g_raid_md_promise_new_disk(struct g_raid_disk *disk)
{
        struct g_raid_softc *sc;
        struct g_raid_md_object *md;
        struct promise_raid_conf *pdmeta;
        struct g_raid_md_promise_perdisk *pd;
        struct g_raid_md_promise_pervolume *pv;
        struct g_raid_volume *vol;
        int i;
        char buf[33];

        sc = disk->d_softc;
        md = sc->sc_md;
        pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;

        if (pd->pd_subdisks == 0) {
                g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
                g_raid_md_promise_refill(sc);
                return;
        }

        for (i = 0; i < pd->pd_subdisks; i++) {
                pdmeta = pd->pd_meta[i];

                /* Look for volume with matching ID. */
                vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
                if (vol == NULL) {
                        promise_meta_get_name(pdmeta, buf);
                        vol = g_raid_create_volume(sc, buf, pdmeta->array_number);
                        pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
                        pv->pv_id = pdmeta->volume_id;
                        vol->v_md_data = pv;
                        callout_init(&pv->pv_start_co, 1);
                        callout_reset(&pv->pv_start_co,
                            g_raid_start_timeout * hz,
                            g_raid_promise_go, vol);
                } else
                        pv = vol->v_md_data;

                /* If we haven't started yet - check metadata freshness. */
                if (pv->pv_meta == NULL || !pv->pv_started) {
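                        /*
                         * Compare generations via an int16_t cast so the
                         * 16-bit counter orders correctly across wraparound.
                         */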
                        if (pv->pv_meta == NULL ||
                            ((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) {
                                G_RAID_DEBUG1(1, sc, "Newer disk");
                                if (pv->pv_meta != NULL)
                                        free(pv->pv_meta, M_MD_PROMISE);
                                pv->pv_meta = promise_meta_copy(pdmeta);
                                pv->pv_generation = pv->pv_meta->generation;
                                pv->pv_disks_present = 1;
                        } else if (pdmeta->generation == pv->pv_generation) {
                                pv->pv_disks_present++;
                                G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)",
                                    pv->pv_disks_present,
                                    pv->pv_meta->total_disks);
                        } else {
                                G_RAID_DEBUG1(1, sc, "Older disk");
                        }
                }
        }

        for (i = 0; i < pd->pd_subdisks; i++) {
                pdmeta = pd->pd_meta[i];

                /* Look for volume with matching ID. */
                vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
                if (vol == NULL)
                        continue;
                pv = vol->v_md_data;

                if (pv->pv_started) {
                        if (g_raid_md_promise_start_disk(disk, i, vol))
                                g_raid_md_write_promise(md, vol, NULL, NULL);
                } else {
                        /* If we collected all needed disks - start array. */
                        if (pv->pv_disks_present == pv->pv_meta->total_disks)
                                g_raid_md_promise_start(vol);
                }
        }
}

static int
g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp,
    struct g_geom **gp)
{
        struct g_geom *geom;
        struct g_raid_softc *sc;

        /* Search for existing node. */
        LIST_FOREACH(geom, &mp->geom, geom) {
                sc = geom->softc;
                if (sc == NULL)
                        continue;
                if (sc->sc_stopping != 0)
                        continue;
                if (sc->sc_md->mdo_class != md->mdo_class)
                        continue;
                break;
        }
        if (geom != NULL) {
                *gp = geom;
                return (G_RAID_MD_TASTE_EXISTING);
        }

        /* Create new one if not found. */
        sc = g_raid_create_node(mp, "Promise", md);
        if (sc == NULL)
                return (G_RAID_MD_TASTE_FAIL);
        md->mdo_softc = sc;
        *gp = sc->sc_geom;
        return (G_RAID_MD_TASTE_NEW);
}

static int
g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp,
                              struct g_consumer *cp, struct g_geom **gp)
{
        struct g_consumer *rcp;
        struct g_provider *pp;
        struct g_raid_softc *sc;
        struct g_raid_disk *disk;
        struct promise_raid_conf *metaarr[4];
        struct g_raid_md_promise_perdisk *pd;
        struct g_geom *geom;
        int i, j, result, len, subdisks;
        char name[16];
        uint16_t vendor;

        G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name);
        pp = cp->provider;

        /* Read metadata from device. */
        g_topology_unlock();
        vendor = 0xffff;
        len = sizeof(vendor);
        if (pp->geom->rank == 1)
                g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
        subdisks = promise_meta_read(cp, metaarr);
        g_topology_lock();
        if (subdisks == 0) {
                if (g_raid_aggressive_spare) {
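                        /* 0x105a is Promise's PCI vendor ID, 0x1002 AMD/ATI's. */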
                        if (vendor == 0x105a || vendor == 0x1002) {
                                G_RAID_DEBUG(1,
                                    "No Promise metadata, forcing spare.");
                                goto search;
                        } else {
                                G_RAID_DEBUG(1,
                                    "Promise/ATI vendor mismatch "
                                    "0x%04x != 0x105a/0x1002",
                                    vendor);
                        }
                }
                return (G_RAID_MD_TASTE_FAIL);
        }

        /* Metadata valid. Print it. */
        for (i = 0; i < subdisks; i++)
                g_raid_md_promise_print(metaarr[i]);

        /* Purge meaningless (empty/spare) records. */
        for (i = 0; i < subdisks; ) {
                if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) {
                        i++;
                        continue;
                }
                free(metaarr[i], M_MD_PROMISE);
                for (j = i; j < subdisks - 1; j++)
                        metaarr[j] = metaarr[j + 1];
                metaarr[subdisks - 1] = NULL;
                subdisks--;
        }

search:
        /* Search for matching node. */
        sc = NULL;
        LIST_FOREACH(geom, &mp->geom, geom) {
                sc = geom->softc;
                if (sc == NULL)
                        continue;
                if (sc->sc_stopping != 0)
                        continue;
                if (sc->sc_md->mdo_class != md->mdo_class)
                        continue;
                break;
        }

        /* Found matching node. */
        if (geom != NULL) {
                G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
                result = G_RAID_MD_TASTE_EXISTING;

        } else { /* Not found matching node -- create one. */
                result = G_RAID_MD_TASTE_NEW;
                snprintf(name, sizeof(name), "Promise");
                sc = g_raid_create_node(mp, name, md);
                md->mdo_softc = sc;
                geom = sc->sc_geom;
        }

        /* There is no return after this point, so we close the passed consumer. */
        g_access(cp, -1, 0, 0);

        rcp = g_new_consumer(geom);
        rcp->flags |= G_CF_DIRECT_RECEIVE;
        g_attach(rcp, pp);
        if (g_access(rcp, 1, 1, 1) != 0)
                ; //goto fail1;

        g_topology_unlock();
        sx_xlock(&sc->sc_lock);

        pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
        pd->pd_subdisks = subdisks;
        for (i = 0; i < subdisks; i++)
                pd->pd_meta[i] = metaarr[i];
        disk = g_raid_create_disk(sc);
        disk->d_md_data = (void *)pd;
        disk->d_consumer = rcp;
        rcp->private = disk;

        g_raid_get_disk_info(disk);

        g_raid_md_promise_new_disk(disk);

        sx_xunlock(&sc->sc_lock);
        g_topology_lock();
        *gp = geom;
        return (result);
}

static int
g_raid_md_event_promise(struct g_raid_md_object *md,
    struct g_raid_disk *disk, u_int event)
{
        struct g_raid_softc *sc;

        sc = md->mdo_softc;
        if (disk == NULL)
                return (-1);
        switch (event) {
        case G_RAID_DISK_E_DISCONNECTED:
                /* Delete disk. */
                g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
                g_raid_destroy_disk(disk);
                g_raid_md_promise_purge_volumes(sc);

                /* Write updated metadata to all disks. */
                g_raid_md_write_promise(md, NULL, NULL, NULL);

                /* Check if anything left. */
                if (g_raid_ndisks(sc, -1) == 0)
                        g_raid_destroy_node(sc, 0);
                else
                        g_raid_md_promise_refill(sc);
                return (0);
        }
        return (-2);
}

static int
g_raid_md_volume_event_promise(struct g_raid_md_object *md,
    struct g_raid_volume *vol, u_int event)
{
        struct g_raid_md_promise_pervolume *pv;

        pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
        switch (event) {
        case G_RAID_VOLUME_E_STARTMD:
                if (!pv->pv_started)
                        g_raid_md_promise_start(vol);
                return (0);
        }
        return (-2);
}

static int
g_raid_md_ctl_promise(struct g_raid_md_object *md,
    struct gctl_req *req)
{
        struct g_raid_softc *sc;
        struct g_raid_volume *vol, *vol1;
        struct g_raid_subdisk *sd;
        struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS];
        struct g_raid_md_promise_perdisk *pd;
        struct g_raid_md_promise_pervolume *pv;
        struct g_consumer *cp;
        struct g_provider *pp;
        char arg[16];
        const char *nodename, *verb, *volname, *levelname, *diskname;
        char *tmp;
        int *nargs, *force;
        off_t esize, offs[PROMISE_MAX_DISKS], size, sectorsize, strip;
        intmax_t *sizearg, *striparg;
        int numdisks, i, len, level, qual;
        int error;

        sc = md->mdo_softc;
        verb = gctl_get_param(req, "verb", NULL);
        nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
        error = 0;
        if (strcmp(verb, "label") == 0) {

                if (*nargs < 4) {
                        gctl_error(req, "Invalid number of arguments.");
                        return (-1);
                }
                volname = gctl_get_asciiparam(req, "arg1");
                if (volname == NULL) {
                        gctl_error(req, "No volume name.");
                        return (-2);
                }
                levelname = gctl_get_asciiparam(req, "arg2");
                if (levelname == NULL) {
                        gctl_error(req, "No RAID level.");
                        return (-3);
                }
                if (strcasecmp(levelname, "RAID5") == 0)
                        levelname = "RAID5-LA";
                if (g_raid_volume_str2level(levelname, &level, &qual)) {
                        gctl_error(req, "Unknown RAID level '%s'.", levelname);
                        return (-4);
                }
                numdisks = *nargs - 3;
                force = gctl_get_paraml(req, "force", sizeof(*force));
                if (!g_raid_md_promise_supported(level, qual, numdisks,
                    force ? *force : 0)) {
                        gctl_error(req, "Unsupported RAID level "
                            "(0x%02x/0x%02x), or number of disks (%d).",
                            level, qual, numdisks);
                        return (-5);
                }

                /* Search for disks, connect them and probe. */
                size = INT64_MAX;
                sectorsize = 0;
                bzero(disks, sizeof(disks));
                bzero(offs, sizeof(offs));
                for (i = 0; i < numdisks; i++) {
                        snprintf(arg, sizeof(arg), "arg%d", i + 3);
                        diskname = gctl_get_asciiparam(req, arg);
                        if (diskname == NULL) {
                                gctl_error(req, "No disk name (%s).", arg);
                                error = -6;
                                break;
                        }
                        if (strcmp(diskname, "NONE") == 0)
                                continue;

                        TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
                                if (disk->d_consumer != NULL &&
                                    disk->d_consumer->provider != NULL &&
                                    strcmp(disk->d_consumer->provider->name,
                                     diskname) == 0)
                                        break;
                        }
                        if (disk != NULL) {
                                if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
                                        gctl_error(req, "Disk '%s' is in a "
                                            "wrong state (%s).", diskname,
                                            g_raid_disk_state2str(disk->d_state));
                                        error = -7;
                                        break;
                                }
                                pd = disk->d_md_data;
                                if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) {
                                        gctl_error(req, "Disk '%s' already "
                                            "used by %d volumes.",
                                            diskname, pd->pd_subdisks);
                                        error = -7;
                                        break;
                                }
                                pp = disk->d_consumer->provider;
                                disks[i] = disk;
                                promise_meta_unused_range(pd->pd_meta,
                                    pd->pd_subdisks,
                                    pp->mediasize / pp->sectorsize,
                                    &offs[i], &esize);
                                size = MIN(size, (off_t)esize * pp->sectorsize);
                                sectorsize = MAX(sectorsize, pp->sectorsize);
                                continue;
                        }

                        g_topology_lock();
                        cp = g_raid_open_consumer(sc, diskname);
                        if (cp == NULL) {
                                gctl_error(req, "Can't open disk '%s'.",
                                    diskname);
                                g_topology_unlock();
                                error = -8;
                                break;
                        }
                        pp = cp->provider;
                        pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
                        disk = g_raid_create_disk(sc);
                        disk->d_md_data = (void *)pd;
                        disk->d_consumer = cp;
                        disks[i] = disk;
                        cp->private = disk;
                        g_topology_unlock();

                        g_raid_get_disk_info(disk);

                        /* Reserve some space for metadata. */
                        size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize);
                        sectorsize = MAX(sectorsize, pp->sectorsize);
                }
                if (error != 0) {
                        for (i = 0; i < numdisks; i++) {
                                if (disks[i] != NULL &&
                                    disks[i]->d_state == G_RAID_DISK_S_NONE)
                                        g_raid_destroy_disk(disks[i]);
                        }
                        return (error);
                }

                if (sectorsize <= 0) {
1399                         gctl_error(req, "Can't get sector size.");
1400                         return (-8);
1401                 }
1402
1403                 /* Handle size argument. */
1404                 len = sizeof(*sizearg);
1405                 sizearg = gctl_get_param(req, "size", &len);
1406                 if (sizearg != NULL && len == sizeof(*sizearg) &&
1407                     *sizearg > 0) {
1408                         if (*sizearg > size) {
1409                                 gctl_error(req, "Size too big %lld > %lld.",
1410                                     (long long)*sizearg, (long long)size);
1411                                 return (-9);
1412                         }
1413                         size = *sizearg;
1414                 }
1415
1416                 /* Handle strip argument. */
1417                 strip = 131072;
1418                 len = sizeof(*striparg);
1419                 striparg = gctl_get_param(req, "strip", &len);
1420                 if (striparg != NULL && len == sizeof(*striparg) &&
1421                     *striparg > 0) {
1422                         if (*striparg < sectorsize) {
1423                                 gctl_error(req, "Strip size too small.");
1424                                 return (-10);
1425                         }
1426                         if (*striparg % sectorsize != 0) {
1427                                 gctl_error(req, "Incorrect strip size.");
1428                                 return (-11);
1429                         }
1430                         strip = *striparg;
1431                 }
1432
1433                 /* Round size down to strip or sector. */
1434                 if (level == G_RAID_VOLUME_RL_RAID1 ||
1435                     level == G_RAID_VOLUME_RL_SINGLE ||
1436                     level == G_RAID_VOLUME_RL_CONCAT)
1437                         size -= (size % sectorsize);
1438                 else if (level == G_RAID_VOLUME_RL_RAID1E &&
1439                     (numdisks & 1) != 0)
1440                         size -= (size % (2 * strip));
1441                 else
1442                         size -= (size % strip);
1443                 if (size <= 0) {
1444                         gctl_error(req, "Size too small.");
1445                         return (-13);
1446                 }
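                     /*
                      * Illustration of the rounding above (numbers are
                      * hypothetical): with the default 131072-byte strip on
                      * an odd-width RAID1E volume, a per-disk size of
                      * 1 GiB + 100 KiB is trimmed to a multiple of
                      * 2 * 131072 bytes, i.e. back to exactly 1 GiB, so the
                      * rotated mirror layout stays aligned.
                      */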
1447
1448                 /* We have all we need; create things: the volume, ... */
1449                 pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
1450                 arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0);
1451                 pv->pv_generation = 0;
1452                 pv->pv_started = 1;
1453                 vol = g_raid_create_volume(sc, volname, -1);
1454                 vol->v_md_data = pv;
1455                 vol->v_raid_level = level;
1456                 vol->v_raid_level_qualifier = qual;
1457                 vol->v_strip_size = strip;
1458                 vol->v_disks_count = numdisks;
1459                 if (level == G_RAID_VOLUME_RL_RAID0 ||
1460                     level == G_RAID_VOLUME_RL_CONCAT ||
1461                     level == G_RAID_VOLUME_RL_SINGLE)
1462                         vol->v_mediasize = size * numdisks;
1463                 else if (level == G_RAID_VOLUME_RL_RAID1)
1464                         vol->v_mediasize = size;
1465                 else if (level == G_RAID_VOLUME_RL_RAID3 ||
1466                     level == G_RAID_VOLUME_RL_RAID5)
1467                         vol->v_mediasize = size * (numdisks - 1);
1468                 else { /* RAID1E */
1469                         vol->v_mediasize = ((size * numdisks) / strip / 2) *
1470                             strip;
1471                 }
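                     /*
                      * Sanity check with illustrative numbers: three 500 GB
                      * subdisks give a 1500 GB RAID0/CONCAT volume or a
                      * 1000 GB RAID3/RAID5 volume; RAID1 exposes a single
                      * disk's worth, and RAID1E half of the total, rounded
                      * down to the strip size.
                      */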
1472                 vol->v_sectorsize = sectorsize;
1473                 g_raid_start_volume(vol);
1474
1475                 /* ... and its subdisks. */
1476                 for (i = 0; i < numdisks; i++) {
1477                         disk = disks[i];
1478                         sd = &vol->v_subdisks[i];
1479                         sd->sd_disk = disk;
1480                         sd->sd_offset = (off_t)offs[i] * 512;
1481                         sd->sd_size = size;
1482                         if (disk == NULL)
1483                                 continue;
1484                         TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1485                         g_raid_change_disk_state(disk,
1486                             G_RAID_DISK_S_ACTIVE);
1487                         g_raid_change_subdisk_state(sd,
1488                             G_RAID_SUBDISK_S_ACTIVE);
1489                         g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1490                             G_RAID_EVENT_SUBDISK);
1491                 }
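                     /*
                      * Disks named "NONE" above leave their slot NULL, so
                      * the volume starts degraded; such slots can typically
                      * be filled in later via `graid insert` or the spare
                      * refill below.
                      */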
1492
1493                 /* Write metadata based on created entities. */
1494                 G_RAID_DEBUG1(0, sc, "Array started.");
1495                 g_raid_md_write_promise(md, vol, NULL, NULL);
1496
1497                 /* Pick up any STALE/SPARE disks to refill the array if needed. */
1498                 g_raid_md_promise_refill(sc);
1499
1500                 g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1501                     G_RAID_EVENT_VOLUME);
1502                 return (0);
1503         }
1504         if (strcmp(verb, "add") == 0) {
1505
1506                 gctl_error(req, "`add` command is not applicable, "
1507                     "use `label` instead.");
1508                 return (-99);
1509         }
1510         if (strcmp(verb, "delete") == 0) {
1511
1512                 nodename = gctl_get_asciiparam(req, "arg0");
1513                 if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0)
1514                         nodename = NULL;
1515
1516                 /* Full node destruction. */
1517                 if (*nargs == 1 && nodename != NULL) {
1518                         /* Check if some volume is still open. */
1519                         force = gctl_get_paraml(req, "force", sizeof(*force));
1520                         if (force != NULL && *force == 0 &&
1521                             g_raid_nopens(sc) != 0) {
1522                                 gctl_error(req, "Some volume is still open.");
1523                                 return (-4);
1524                         }
1525
1526                         TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1527                                 if (disk->d_consumer)
1528                                         promise_meta_erase(disk->d_consumer);
1529                         }
1530                         g_raid_destroy_node(sc, 0);
1531                         return (0);
1532                 }
1533
1534                 /* Destroy the specified volume; if it was the last one, destroy the whole node. */
1535                 if (*nargs > 2) {
1536                         gctl_error(req, "Invalid number of arguments.");
1537                         return (-1);
1538                 }
1539                 volname = gctl_get_asciiparam(req,
1540                     nodename != NULL ? "arg1" : "arg0");
1541                 if (volname == NULL) {
1542                         gctl_error(req, "No volume name.");
1543                         return (-2);
1544                 }
1545
1546                 /* Search for volume. */
1547                 TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1548                         if (strcmp(vol->v_name, volname) == 0)
1549                                 break;
1550                         pp = vol->v_provider;
1551                         if (pp == NULL)
1552                                 continue;
1553                         if (strcmp(pp->name, volname) == 0)
1554                                 break;
1555                         if (strncmp(pp->name, "raid/", 5) == 0 &&
1556                             strcmp(pp->name + 5, volname) == 0)
1557                                 break;
1558                 }
1559                 if (vol == NULL) {
1560                         i = strtol(volname, &tmp, 10);
1561                         if (verb != volname && tmp[0] == 0) {
1562                                 TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1563                                         if (vol->v_global_id == i)
1564                                                 break;
1565                                 }
1566                         }
1567                 }
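                     /*
                      * The lookups above accept a volume name, a provider
                      * name (with or without the "raid/" prefix) or, as a
                      * last resort, a numeric global volume ID.
                      */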
1568                 if (vol == NULL) {
1569                         gctl_error(req, "Volume '%s' not found.", volname);
1570                         return (-3);
1571                 }
1572
1573                 /* Check if volume is still open. */
1574                 force = gctl_get_paraml(req, "force", sizeof(*force));
1575                 if (force != NULL && *force == 0 &&
1576                     vol->v_provider_open != 0) {
1577                         gctl_error(req, "Volume is still open.");
1578                         return (-4);
1579                 }
1580
1581                 /* Destroy volume and potentially node. */
1582                 i = 0;
1583                 TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
1584                         i++;
1585                 if (i >= 2) {
1586                         g_raid_destroy_volume(vol);
1587                         g_raid_md_promise_purge_disks(sc);
1588                         g_raid_md_write_promise(md, NULL, NULL, NULL);
1589                 } else {
1590                         TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1591                                 if (disk->d_consumer)
1592                                         promise_meta_erase(disk->d_consumer);
1593                         }
1594                         g_raid_destroy_node(sc, 0);
1595                 }
1596                 return (0);
1597         }
1598         if (strcmp(verb, "remove") == 0 ||
1599             strcmp(verb, "fail") == 0) {
1600                 if (*nargs < 2) {
1601                         gctl_error(req, "Invalid number of arguments.");
1602                         return (-1);
1603                 }
1604                 for (i = 1; i < *nargs; i++) {
1605                         snprintf(arg, sizeof(arg), "arg%d", i);
1606                         diskname = gctl_get_asciiparam(req, arg);
1607                         if (diskname == NULL) {
1608                                 gctl_error(req, "No disk name (%s).", arg);
1609                                 error = -2;
1610                                 break;
1611                         }
1612                         if (strncmp(diskname, "/dev/", 5) == 0)
1613                                 diskname += 5;
1614
1615                         TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1616                                 if (disk->d_consumer != NULL &&
1617                                     disk->d_consumer->provider != NULL &&
1618                                     strcmp(disk->d_consumer->provider->name,
1619                                      diskname) == 0)
1620                                         break;
1621                         }
1622                         if (disk == NULL) {
1623                                 gctl_error(req, "Disk '%s' not found.",
1624                                     diskname);
1625                                 error = -3;
1626                                 break;
1627                         }
1628
1629                         if (strcmp(verb, "fail") == 0) {
1630                                 g_raid_md_fail_disk_promise(md, NULL, disk);
1631                                 continue;
1632                         }
1633
1634                         /* Erase metadata on the disk being deleted and destroy it. */
1635                         promise_meta_erase(disk->d_consumer);
1636                         g_raid_destroy_disk(disk);
1637                 }
1638                 g_raid_md_promise_purge_volumes(sc);
1639
1640                 /* Write updated metadata to remaining disks. */
1641                 g_raid_md_write_promise(md, NULL, NULL, NULL);
1642
1643                 /* Check whether anything is left. */
1644                 if (g_raid_ndisks(sc, -1) == 0)
1645                         g_raid_destroy_node(sc, 0);
1646                 else
1647                         g_raid_md_promise_refill(sc);
1648                 return (error);
1649         }
1650         if (strcmp(verb, "insert") == 0) {
1651                 if (*nargs < 2) {
1652                         gctl_error(req, "Invalid number of arguments.");
1653                         return (-1);
1654                 }
1655                 for (i = 1; i < *nargs; i++) {
1656                         /* Get disk name. */
1657                         snprintf(arg, sizeof(arg), "arg%d", i);
1658                         diskname = gctl_get_asciiparam(req, arg);
1659                         if (diskname == NULL) {
1660                                 gctl_error(req, "No disk name (%s).", arg);
1661                                 error = -3;
1662                                 break;
1663                         }
1664
1665                         /* Try to find provider with specified name. */
1666                         g_topology_lock();
1667                         cp = g_raid_open_consumer(sc, diskname);
1668                         if (cp == NULL) {
1669                                 gctl_error(req, "Can't open disk '%s'.",
1670                                     diskname);
1671                                 g_topology_unlock();
1672                                 error = -4;
1673                                 break;
1674                         }
1675                         pp = cp->provider;
1676                         g_topology_unlock();
1677
1678                         pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
1679
1680                         disk = g_raid_create_disk(sc);
1681                         disk->d_consumer = cp;
1682                         disk->d_md_data = (void *)pd;
1683                         cp->private = disk;
1684
1685                         g_raid_get_disk_info(disk);
1686
1687                         /* Welcome the "new" disk. */
1688                         g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
1689                         promise_meta_write_spare(cp);
1690                         g_raid_md_promise_refill(sc);
1691                 }
1692                 return (error);
1693         }
1694         return (-100);
1695 }
1696
1697 static int
1698 g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol,
1699     struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
1700 {
1701         struct g_raid_softc *sc;
1702         struct g_raid_volume *vol;
1703         struct g_raid_subdisk *sd;
1704         struct g_raid_disk *disk;
1705         struct g_raid_md_promise_perdisk *pd;
1706         struct g_raid_md_promise_pervolume *pv;
1707         struct promise_raid_conf *meta;
1708         off_t rebuild_lba64;
1709         int i, j, pos, rebuild;
1710
1711         sc = md->mdo_softc;
1712
1713         if (sc->sc_stopping == G_RAID_DESTROY_HARD)
1714                 return (0);
1715
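             /*
              * Promise metadata is per volume: each disk stores one
              * independent metadata block for every volume (subdisk) it
              * participates in, up to PROMISE_MAX_SUBDISKS of them, so the
              * loop below regenerates only the affected blocks.
              */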
1716         /* Generate new per-volume metadata for affected volumes. */
1717         TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1718                 if (vol->v_stopping)
1719                         continue;
1720
1721                 /* Skip volumes not related to specified targets. */
1722                 if (tvol != NULL && vol != tvol)
1723                         continue;
1724                 if (tsd != NULL && vol != tsd->sd_volume)
1725                         continue;
1726                 if (tdisk != NULL) {
1727                         for (i = 0; i < vol->v_disks_count; i++) {
1728                                 if (vol->v_subdisks[i].sd_disk == tdisk)
1729                                         break;
1730                         }
1731                         if (i >= vol->v_disks_count)
1732                                 continue;
1733                 }
1734
1735                 pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
1736                 pv->pv_generation++;
1737
1738                 meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
1739                 if (pv->pv_meta != NULL)
1740                         memcpy(meta, pv->pv_meta, sizeof(*meta));
1741                 memcpy(meta->promise_id, PROMISE_MAGIC,
1742                     sizeof(PROMISE_MAGIC) - 1);
1743                 meta->dummy_0 = 0x00020000;
1744                 meta->integrity = PROMISE_I_VALID;
1745
1746                 meta->generation = pv->pv_generation;
1747                 meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE |
1748                     PROMISE_S_INITED | PROMISE_S_READY;
1749                 if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED)
1750                         meta->status |= PROMISE_S_DEGRADED;
1751                 if (vol->v_dirty)
1752                         meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */
1753                 if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 ||
1754                     vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE)
1755                         meta->type = PROMISE_T_RAID0;
1756                 else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1757                     vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
1758                         meta->type = PROMISE_T_RAID1;
1759                 else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3)
1760                         meta->type = PROMISE_T_RAID3;
1761                 else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
1762                         meta->type = PROMISE_T_RAID5;
1763                 else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT)
1764                         meta->type = PROMISE_T_SPAN;
1765                 else
1766                         meta->type = PROMISE_T_JBOD;
1767                 meta->total_disks = vol->v_disks_count;
1768                 meta->stripe_shift = ffs(vol->v_strip_size / 1024);
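                     /*
                      * stripe_shift above works out to log2 of the strip
                      * size in 512-byte sectors: the default 131072-byte
                      * strip is 256 sectors, and
                      * ffs(131072 / 1024) = ffs(128) = 8 = log2(256).
                      */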
1769                 meta->array_width = vol->v_disks_count;
1770                 if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1771                     vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
1772                         meta->array_width /= 2;
1773                 meta->array_number = vol->v_global_id;
1774                 meta->total_sectors = vol->v_mediasize / 512;
1775                 meta->total_sectors_high = (vol->v_mediasize / 512) >> 32;
1776                 meta->sector_size = vol->v_sectorsize / 512;
1777                 meta->cylinders = meta->total_sectors / (255 * 63) - 1;
1778                 meta->heads = 254;
1779                 meta->sectors = 63;
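                     /*
                      * The CHS values above are synthetic: the classic
                      * 255-head, 63-sector translation, with cylinders and
                      * heads apparently stored minus one; they are filled in
                      * only because the on-disk format has fields for them.
                      */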
1780                 meta->volume_id = pv->pv_id;
1781                 rebuild_lba64 = UINT64_MAX;
1782                 rebuild = 0;
1783                 for (i = 0; i < vol->v_disks_count; i++) {
1784                         sd = &vol->v_subdisks[i];
1785                         /* For RAID0+1 we need to translate order. */
1786                         pos = promise_meta_translate_disk(vol, i);
1787                         meta->disks[pos].flags = PROMISE_F_VALID |
1788                             PROMISE_F_ASSIGNED;
1789                         if (sd->sd_state == G_RAID_SUBDISK_S_NONE) {
1790                                 meta->disks[pos].flags |= 0;
1791                         } else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) {
1792                                 meta->disks[pos].flags |=
1793                                     PROMISE_F_DOWN | PROMISE_F_REDIR;
1794                         } else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) {
1795                                 meta->disks[pos].flags |=
1796                                     PROMISE_F_ONLINE | PROMISE_F_REDIR;
1797                                 if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
1798                                         rebuild_lba64 = MIN(rebuild_lba64,
1799                                             sd->sd_rebuild_pos / 512);
1800                                 } else
1801                                         rebuild_lba64 = 0;
1802                                 rebuild = 1;
1803                         } else {
1804                                 meta->disks[pos].flags |= PROMISE_F_ONLINE;
1805                                 if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) {
1806                                         meta->status |= PROMISE_S_MARKED;
1807                                         if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
1808                                                 rebuild_lba64 = MIN(rebuild_lba64,
1809                                                     sd->sd_rebuild_pos / 512);
1810                                         } else
1811                                                 rebuild_lba64 = 0;
1812                                 }
1813                         }
1814                         if (pv->pv_meta != NULL) {
1815                                 meta->disks[pos].id = pv->pv_meta->disks[pos].id;
1816                         } else {
1817                                 meta->disks[pos].number = i * 2;
1818                                 arc4rand(&meta->disks[pos].id,
1819                                     sizeof(meta->disks[pos].id), 0);
1820                         }
1821                 }
1822                 promise_meta_put_name(meta, vol->v_name);
1823
1824                 /* Try to mimic AMD BIOS rebuild/resync behavior. */
1825                 if (rebuild_lba64 != UINT64_MAX) {
1826                         if (rebuild)
1827                                 meta->magic_3 = 0x03040010UL; /* Rebuild? */
1828                         else
1829                                 meta->magic_3 = 0x03040008UL; /* Resync? */
1830                         /* Translate from per-disk to per-volume LBA. */
1831                         if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1832                             vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
1833                                 rebuild_lba64 *= meta->array_width;
1834                         } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
1835                             vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) {
1836                                 rebuild_lba64 *= meta->array_width - 1;
1837                         } else
1838                                 rebuild_lba64 = 0;
1839                 } else
1840                         meta->magic_3 = 0x03000000UL;
1841                 meta->rebuild_lba64 = rebuild_lba64;
1842                 meta->magic_4 = 0x04010101UL;
1843
1844                 /* Replace per-volume metadata with new. */
1845                 if (pv->pv_meta != NULL)
1846                         free(pv->pv_meta, M_MD_PROMISE);
1847                 pv->pv_meta = meta;
1848
1849                 /* Copy new metadata to the disks, adding or replacing old. */
1850                 for (i = 0; i < vol->v_disks_count; i++) {
1851                         sd = &vol->v_subdisks[i];
1852                         disk = sd->sd_disk;
1853                         if (disk == NULL)
1854                                 continue;
1855                         /* For RAID0+1 we need to translate order. */
1856                         pos = promise_meta_translate_disk(vol, i);
1857                         pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1858                         for (j = 0; j < pd->pd_subdisks; j++) {
1859                                 if (pd->pd_meta[j]->volume_id == meta->volume_id)
1860                                         break;
1861                         }
1862                         if (j == pd->pd_subdisks)
1863                                 pd->pd_subdisks++;
1864                         if (pd->pd_meta[j] != NULL)
1865                                 free(pd->pd_meta[j], M_MD_PROMISE);
1866                         pd->pd_meta[j] = promise_meta_copy(meta);
1867                         pd->pd_meta[j]->disk = meta->disks[pos];
1868                         pd->pd_meta[j]->disk.number = pos;
1869                         pd->pd_meta[j]->disk_offset_high =
1870                             (sd->sd_offset / 512) >> 32;
1871                         pd->pd_meta[j]->disk_offset = sd->sd_offset / 512;
1872                         pd->pd_meta[j]->disk_sectors_high =
1873                             (sd->sd_size / 512) >> 32;
1874                         pd->pd_meta[j]->disk_sectors = sd->sd_size / 512;
1875                         if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
1876                                 pd->pd_meta[j]->disk_rebuild_high =
1877                                     (sd->sd_rebuild_pos / 512) >> 32;
1878                                 pd->pd_meta[j]->disk_rebuild =
1879                                     sd->sd_rebuild_pos / 512;
1880                         } else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD) {
1881                                 pd->pd_meta[j]->disk_rebuild_high = 0;
1882                                 pd->pd_meta[j]->disk_rebuild = 0;
1883                         } else {
1884                                 pd->pd_meta[j]->disk_rebuild_high = UINT32_MAX;
1885                                 pd->pd_meta[j]->disk_rebuild = UINT32_MAX;
1886                         }
1887                         pd->pd_updated = 1;
1888                 }
1889         }
1890
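             /*
              * Flush the refreshed copies: only ACTIVE disks whose cached
              * metadata was touched above (pd_updated) are rewritten, so
              * updating one volume does not disturb unrelated disks.
              */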
1891         TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1892                 pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1893                 if (disk->d_state != G_RAID_DISK_S_ACTIVE)
1894                         continue;
1895                 if (!pd->pd_updated)
1896                         continue;
1897                 G_RAID_DEBUG(1, "Writing Promise metadata to %s",
1898                     g_raid_get_diskname(disk));
1899                 for (i = 0; i < pd->pd_subdisks; i++)
1900                         g_raid_md_promise_print(pd->pd_meta[i]);
1901                 promise_meta_write(disk->d_consumer,
1902                     pd->pd_meta, pd->pd_subdisks);
1903                 pd->pd_updated = 0;
1904         }
1905
1906         return (0);
1907 }
1908
1909 static int
1910 g_raid_md_fail_disk_promise(struct g_raid_md_object *md,
1911     struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
1912 {
1913         struct g_raid_softc *sc;
1914         struct g_raid_md_promise_perdisk *pd;
1915         struct g_raid_subdisk *sd;
1916         int i, pos;
1917
1918         sc = md->mdo_softc;
1919         pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data;
1920
1921         /* We can't fail a disk that is not currently part of the array. */
1922         if (tdisk->d_state != G_RAID_DISK_S_ACTIVE)
1923                 return (-1);
1924
1925         /*
1926          * Mark disk as failed in metadata and try to write that metadata
1927          * to the disk itself to prevent its later resurrection as STALE.
1928          */
1929         if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL)
1930                 G_RAID_DEBUG(1, "Writing Promise metadata to %s",
1931                     g_raid_get_diskname(tdisk));
1932         for (i = 0; i < pd->pd_subdisks; i++) {
1933                 pd->pd_meta[i]->disk.flags |=
1934                     PROMISE_F_DOWN | PROMISE_F_REDIR;
1935                 pos = pd->pd_meta[i]->disk.number;
1936                 if (pos >= 0 && pos < PROMISE_MAX_DISKS) {
1937                         pd->pd_meta[i]->disks[pos].flags |=
1938                             PROMISE_F_DOWN | PROMISE_F_REDIR;
1939                 }
1940                 g_raid_md_promise_print(pd->pd_meta[i]);
1941         }
1942         if (tdisk->d_consumer != NULL)
1943                 promise_meta_write(tdisk->d_consumer,
1944                     pd->pd_meta, pd->pd_subdisks);
1945
1946         /* Change states. */
1947         g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
1948         TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
1949                 g_raid_change_subdisk_state(sd,
1950                     G_RAID_SUBDISK_S_FAILED);
1951                 g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
1952                     G_RAID_EVENT_SUBDISK);
1953         }
1954
1955         /* Write updated metadata to remaining disks. */
1956         g_raid_md_write_promise(md, NULL, NULL, tdisk);
1957
1958         g_raid_md_promise_refill(sc);
1959         return (0);
1960 }
1961
1962 static int
1963 g_raid_md_free_disk_promise(struct g_raid_md_object *md,
1964     struct g_raid_disk *disk)
1965 {
1966         struct g_raid_md_promise_perdisk *pd;
1967         int i;
1968
1969         pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
1970         for (i = 0; i < pd->pd_subdisks; i++) {
1971                 if (pd->pd_meta[i] != NULL) {
1972                         free(pd->pd_meta[i], M_MD_PROMISE);
1973                         pd->pd_meta[i] = NULL;
1974                 }
1975         }
1976         free(pd, M_MD_PROMISE);
1977         disk->d_md_data = NULL;
1978         return (0);
1979 }
1980
1981 static int
1982 g_raid_md_free_volume_promise(struct g_raid_md_object *md,
1983     struct g_raid_volume *vol)
1984 {
1985         struct g_raid_md_promise_pervolume *pv;
1986
1987         pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
1988         if (pv && pv->pv_meta != NULL) {
1989                 free(pv->pv_meta, M_MD_PROMISE);
1990                 pv->pv_meta = NULL;
1991         }
1992         if (pv && !pv->pv_started) {
1993                 pv->pv_started = 1;
1994                 callout_stop(&pv->pv_start_co);
1995         }
1996         free(pv, M_MD_PROMISE);
1997         vol->v_md_data = NULL;
1998         return (0);
1999 }
2000
2001 static int
2002 g_raid_md_free_promise(struct g_raid_md_object *md)
2003 {
2004
2005         return (0);
2006 }
2007
2008 G_RAID_MD_DECLARE(promise, "Promise");