/*-
 * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/eventhandler.h>
#include <vm/uma.h>
#include <geom/geom.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/sched.h>
#include <geom/raid3/g_raid3.h>

FEATURE(geom_raid3, "GEOM RAID-3 functionality");

static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data");

SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff");
u_int g_raid3_debug = 0;
TUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0,
    "Debug level");
static u_int g_raid3_timeout = 4;
TUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout,
    0, "Time to wait on all raid3 components");
static u_int g_raid3_idletime = 5;
TUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW,
    &g_raid3_idletime, 0, "Mark components as clean when idling");
static u_int g_raid3_disconnect_on_failure = 1;
TUNABLE_INT("kern.geom.raid3.disconnect_on_failure",
    &g_raid3_disconnect_on_failure);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RW,
    &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
static u_int g_raid3_syncreqs = 2;
TUNABLE_INT("kern.geom.raid3.sync_requests", &g_raid3_syncreqs);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
    &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests.");
static u_int g_raid3_use_malloc = 0;
TUNABLE_INT("kern.geom.raid3.use_malloc", &g_raid3_use_malloc);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, use_malloc, CTLFLAG_RDTUN,
    &g_raid3_use_malloc, 0, "Use malloc(9) instead of uma(9).");

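/*
 * Upper bounds for the three UMA zones used for data buffer allocations
 * (see g_raid3_alloc() and g_raid3_uma_ctor() below, which fails an
 * allocation once a zone's sz_max cap is reached).  Read-only sysctls,
 * settable as loader tunables.
 */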
static u_int g_raid3_n64k = 50;
TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0,
    "Maximum number of 64kB allocations");
static u_int g_raid3_n16k = 200;
TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0,
    "Maximum number of 16kB allocations");
static u_int g_raid3_n4k = 1200;
TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0,
    "Maximum number of 4kB allocations");

SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
    "GEOM_RAID3 statistics");
static u_int g_raid3_parity_mismatch = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
    &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");

#define MSLEEP(ident, mtx, priority, wmesg, timeout)    do {            \
        G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));        \
        msleep((ident), (mtx), (priority), (wmesg), (timeout));         \
        G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));        \
} while (0)

static eventhandler_tag g_raid3_pre_sync = NULL;

static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_raid3_taste;
static void g_raid3_init(struct g_class *mp);
static void g_raid3_fini(struct g_class *mp);

struct g_class g_raid3_class = {
        .name = G_RAID3_CLASS_NAME,
        .version = G_VERSION,
        .ctlreq = g_raid3_config,
        .taste = g_raid3_taste,
        .destroy_geom = g_raid3_destroy_geom,
        .init = g_raid3_init,
        .fini = g_raid3_fini
};


static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
static int g_raid3_register_request(struct bio *pbp);
static void g_raid3_sync_release(struct g_raid3_softc *sc);


static const char *
g_raid3_disk_state2str(int state)
{

        switch (state) {
        case G_RAID3_DISK_STATE_NODISK:
                return ("NODISK");
        case G_RAID3_DISK_STATE_NONE:
                return ("NONE");
        case G_RAID3_DISK_STATE_NEW:
                return ("NEW");
        case G_RAID3_DISK_STATE_ACTIVE:
                return ("ACTIVE");
        case G_RAID3_DISK_STATE_STALE:
                return ("STALE");
        case G_RAID3_DISK_STATE_SYNCHRONIZING:
                return ("SYNCHRONIZING");
        case G_RAID3_DISK_STATE_DISCONNECTED:
                return ("DISCONNECTED");
        default:
                return ("INVALID");
        }
}

static const char *
g_raid3_device_state2str(int state)
{

        switch (state) {
        case G_RAID3_DEVICE_STATE_STARTING:
                return ("STARTING");
        case G_RAID3_DEVICE_STATE_DEGRADED:
                return ("DEGRADED");
        case G_RAID3_DEVICE_STATE_COMPLETE:
                return ("COMPLETE");
        default:
                return ("INVALID");
        }
}

const char *
g_raid3_get_diskname(struct g_raid3_disk *disk)
{

        if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
                return ("[unknown]");
        return (disk->d_name);
}

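/*
 * Allocate a data buffer.  Unless kern.geom.raid3.use_malloc is set, or the
 * size does not map onto one of the per-device UMA zones, the buffer comes
 * from the matching zone; g_raid3_uma_ctor() below caps the number of
 * outstanding buffers, and the sz_requested/sz_failed counters track demand.
 */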
static void *
g_raid3_alloc(struct g_raid3_softc *sc, size_t size, int flags)
{
        void *ptr;
        enum g_raid3_zones zone;

        if (g_raid3_use_malloc ||
            (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES)
                ptr = malloc(size, M_RAID3, flags);
        else {
                ptr = uma_zalloc_arg(sc->sc_zones[zone].sz_zone,
                    &sc->sc_zones[zone], flags);
                sc->sc_zones[zone].sz_requested++;
                if (ptr == NULL)
                        sc->sc_zones[zone].sz_failed++;
        }
        return (ptr);
}

static void
g_raid3_free(struct g_raid3_softc *sc, void *ptr, size_t size)
{
        enum g_raid3_zones zone;

        if (g_raid3_use_malloc ||
            (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES)
                free(ptr, M_RAID3);
        else {
                uma_zfree_arg(sc->sc_zones[zone].sz_zone,
                    ptr, &sc->sc_zones[zone]);
        }
}

static int
g_raid3_uma_ctor(void *mem, int size, void *arg, int flags)
{
        struct g_raid3_zone *sz = arg;

        if (sz->sz_max > 0 && sz->sz_inuse == sz->sz_max)
                return (ENOMEM);
        sz->sz_inuse++;
        return (0);
}

static void
g_raid3_uma_dtor(void *mem, int size, void *arg)
{
        struct g_raid3_zone *sz = arg;

        sz->sz_inuse--;
}

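/*
 * RAID3 parity is the bytewise XOR of all data components: at every offset,
 * parity = d0 ^ d1 ^ ... ^ dn.  The helper below XORs `src' into `dst' one
 * uint64_t at a time, manually unrolled to process 128 bytes per iteration,
 * which is why callers must pass sizes that are multiples of 128 (asserted
 * below).
 */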
#define g_raid3_xor(src, dst, size)                                     \
        _g_raid3_xor((uint64_t *)(src),                                 \
            (uint64_t *)(dst), (size_t)(size))
static void
_g_raid3_xor(uint64_t *src, uint64_t *dst, size_t size)
{

        KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
        for (; size > 0; size -= 128) {
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
        }
}

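/*
 * Check whether a bio's buffer contains only zeroes.  Used by
 * g_raid3_gather() in VERIFY mode: after XORing the parity column with all
 * data columns, a consistent stripe must yield an all-zero buffer.
 */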
static int
g_raid3_is_zero(struct bio *bp)
{
        static const uint64_t zeros[] = {
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        };
        u_char *addr;
        ssize_t size;

        size = bp->bio_length;
        addr = (u_char *)bp->bio_data;
        for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
                if (bcmp(addr, zeros, sizeof(zeros)) != 0)
                        return (0);
        }
        return (1);
}

/*
 * --- Event handling functions ---
 * Events in geom_raid3 are used to maintain disk and device state changes
 * from a single thread, which simplifies locking.
 */
static void
g_raid3_event_free(struct g_raid3_event *ep)
{

        free(ep, M_RAID3);
}

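/*
 * Queue a state-change event and wake the worker thread.  With
 * G_RAID3_EVENT_DONTWAIT the caller returns immediately; otherwise it drops
 * sc_lock and sleeps until the worker marks the event G_RAID3_EVENT_DONE,
 * then reaps the event and returns the worker's error code.
 */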
int
g_raid3_event_send(void *arg, int state, int flags)
{
        struct g_raid3_softc *sc;
        struct g_raid3_disk *disk;
        struct g_raid3_event *ep;
        int error;

        ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
        G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
        if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
                disk = NULL;
                sc = arg;
        } else {
                disk = arg;
                sc = disk->d_softc;
        }
        ep->e_disk = disk;
        ep->e_state = state;
        ep->e_flags = flags;
        ep->e_error = 0;
        mtx_lock(&sc->sc_events_mtx);
        TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
        mtx_unlock(&sc->sc_events_mtx);
        G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
        mtx_lock(&sc->sc_queue_mtx);
        wakeup(sc);
        wakeup(&sc->sc_queue);
        mtx_unlock(&sc->sc_queue_mtx);
        if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
                return (0);
        sx_assert(&sc->sc_lock, SX_XLOCKED);
        G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
        sx_xunlock(&sc->sc_lock);
        while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
                mtx_lock(&sc->sc_events_mtx);
                MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
                    hz * 5);
        }
        error = ep->e_error;
        g_raid3_event_free(ep);
        sx_xlock(&sc->sc_lock);
        return (error);
}

static struct g_raid3_event *
g_raid3_event_get(struct g_raid3_softc *sc)
{
        struct g_raid3_event *ep;

        mtx_lock(&sc->sc_events_mtx);
        ep = TAILQ_FIRST(&sc->sc_events);
        mtx_unlock(&sc->sc_events_mtx);
        return (ep);
}

static void
g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
{

        mtx_lock(&sc->sc_events_mtx);
        TAILQ_REMOVE(&sc->sc_events, ep, e_next);
        mtx_unlock(&sc->sc_events_mtx);
}

static void
g_raid3_event_cancel(struct g_raid3_disk *disk)
{
        struct g_raid3_softc *sc;
        struct g_raid3_event *ep, *tmpep;

        sc = disk->d_softc;
        sx_assert(&sc->sc_lock, SX_XLOCKED);

        mtx_lock(&sc->sc_events_mtx);
        TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
                if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
                        continue;
                if (ep->e_disk != disk)
                        continue;
                TAILQ_REMOVE(&sc->sc_events, ep, e_next);
                if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
                        g_raid3_event_free(ep);
                else {
                        ep->e_error = ECANCELED;
                        wakeup(ep);
                }
        }
        mtx_unlock(&sc->sc_events_mtx);
}

/*
 * Return the number of disks in the given state.
 * If state is equal to -1, count all connected disks.
 */
u_int
g_raid3_ndisks(struct g_raid3_softc *sc, int state)
{
        struct g_raid3_disk *disk;
        u_int n, ndisks;

        sx_assert(&sc->sc_lock, SX_LOCKED);

        for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
                disk = &sc->sc_disks[n];
                if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
                        continue;
                if (state == -1 || disk->d_state == state)
                        ndisks++;
        }
        return (ndisks);
}

static u_int
g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
{
        struct bio *bp;
        u_int nreqs = 0;

        mtx_lock(&sc->sc_queue_mtx);
        TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
                if (bp->bio_from == cp)
                        nreqs++;
        }
        mtx_unlock(&sc->sc_queue_mtx);
        return (nreqs);
}

static int
g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
{

        if (cp->index > 0) {
                G_RAID3_DEBUG(2,
                    "I/O requests for %s exist, can't destroy it now.",
                    cp->provider->name);
                return (1);
        }
        if (g_raid3_nrequests(sc, cp) > 0) {
                G_RAID3_DEBUG(2,
                    "I/O requests for %s in queue, can't destroy it now.",
                    cp->provider->name);
                return (1);
        }
        return (0);
}

static void
g_raid3_destroy_consumer(void *arg, int flags __unused)
{
        struct g_consumer *cp;

        g_topology_assert();

        cp = arg;
        G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
        g_detach(cp);
        g_destroy_consumer(cp);
}

static void
g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{
        struct g_provider *pp;
        int retaste_wait;

        g_topology_assert();

        cp->private = NULL;
        if (g_raid3_is_busy(sc, cp))
                return;
        G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
        pp = cp->provider;
        retaste_wait = 0;
        if (cp->acw == 1) {
                if ((pp->geom->flags & G_GEOM_WITHER) == 0)
                        retaste_wait = 1;
        }
        G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
            -cp->acw, -cp->ace, 0);
        if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
                g_access(cp, -cp->acr, -cp->acw, -cp->ace);
        if (retaste_wait) {
                /*
                 * After the retaste event has been sent (inside g_access()),
                 * we can post an event to detach and destroy the consumer.
                 * A class which already has a consumer attached to the given
                 * provider will not receive a retaste event for it.  This is
                 * how retaste events are ignored when consumers opened for
                 * writing are closed: the consumer is detached and destroyed
                 * only after the retaste event has been sent.
                 */
                g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
                return;
        }
        G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
        g_detach(cp);
        g_destroy_consumer(cp);
}

static int
g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
{
        struct g_consumer *cp;
        int error;

        g_topology_assert_not();
        KASSERT(disk->d_consumer == NULL,
            ("Disk already connected (device %s).", disk->d_softc->sc_name));

        g_topology_lock();
        cp = g_new_consumer(disk->d_softc->sc_geom);
        error = g_attach(cp, pp);
        if (error != 0) {
                g_destroy_consumer(cp);
                g_topology_unlock();
                return (error);
        }
        error = g_access(cp, 1, 1, 1);
        g_topology_unlock();
        if (error != 0) {
                g_detach(cp);
                g_destroy_consumer(cp);
                G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
                    pp->name, error);
                return (error);
        }
        disk->d_consumer = cp;
        disk->d_consumer->private = disk;
        disk->d_consumer->index = 0;
        G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
        return (0);
}

static void
g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{

        g_topology_assert();

        if (cp == NULL)
                return;
        if (cp->provider != NULL)
                g_raid3_kill_consumer(sc, cp);
        else
                g_destroy_consumer(cp);
}

/*
 * Initialize the disk.  This means allocate memory, create a consumer,
 * attach it to the provider, and open access (r1w1e1) to it.
 */
static struct g_raid3_disk *
g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md, int *errorp)
{
        struct g_raid3_disk *disk;
        int error;

        disk = &sc->sc_disks[md->md_no];
        error = g_raid3_connect_disk(disk, pp);
        if (error != 0) {
                if (errorp != NULL)
                        *errorp = error;
                return (NULL);
        }
        disk->d_state = G_RAID3_DISK_STATE_NONE;
        disk->d_flags = md->md_dflags;
        if (md->md_provider[0] != '\0')
                disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
        disk->d_sync.ds_consumer = NULL;
        disk->d_sync.ds_offset = md->md_sync_offset;
        disk->d_sync.ds_offset_done = md->md_sync_offset;
        disk->d_genid = md->md_genid;
        disk->d_sync.ds_syncid = md->md_syncid;
        if (errorp != NULL)
                *errorp = 0;
        return (disk);
}

static void
g_raid3_destroy_disk(struct g_raid3_disk *disk)
{
        struct g_raid3_softc *sc;

        g_topology_assert_not();
        sc = disk->d_softc;
        sx_assert(&sc->sc_lock, SX_XLOCKED);

        if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
                return;
        g_raid3_event_cancel(disk);
        switch (disk->d_state) {
        case G_RAID3_DISK_STATE_SYNCHRONIZING:
                if (sc->sc_syncdisk != NULL)
                        g_raid3_sync_stop(sc, 1);
                /* FALLTHROUGH */
        case G_RAID3_DISK_STATE_NEW:
        case G_RAID3_DISK_STATE_STALE:
        case G_RAID3_DISK_STATE_ACTIVE:
                g_topology_lock();
                g_raid3_disconnect_consumer(sc, disk->d_consumer);
                g_topology_unlock();
                disk->d_consumer = NULL;
                break;
        default:
                KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
                    g_raid3_get_diskname(disk),
                    g_raid3_disk_state2str(disk->d_state)));
        }
        disk->d_state = G_RAID3_DISK_STATE_NODISK;
}

static void
g_raid3_destroy_device(struct g_raid3_softc *sc)
{
        struct g_raid3_event *ep;
        struct g_raid3_disk *disk;
        struct g_geom *gp;
        struct g_consumer *cp;
        u_int n;

        g_topology_assert_not();
        sx_assert(&sc->sc_lock, SX_XLOCKED);

        gp = sc->sc_geom;
        if (sc->sc_provider != NULL)
                g_raid3_destroy_provider(sc);
        for (n = 0; n < sc->sc_ndisks; n++) {
                disk = &sc->sc_disks[n];
                if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
                        disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
                        g_raid3_update_metadata(disk);
                        g_raid3_destroy_disk(disk);
                }
        }
        while ((ep = g_raid3_event_get(sc)) != NULL) {
                g_raid3_event_remove(sc, ep);
                if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
                        g_raid3_event_free(ep);
                else {
                        ep->e_error = ECANCELED;
                        ep->e_flags |= G_RAID3_EVENT_DONE;
                        G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
                        mtx_lock(&sc->sc_events_mtx);
                        wakeup(ep);
                        mtx_unlock(&sc->sc_events_mtx);
                }
        }
        callout_drain(&sc->sc_callout);
        cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
        g_topology_lock();
        if (cp != NULL)
                g_raid3_disconnect_consumer(sc, cp);
        g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
        G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
        g_wither_geom(gp, ENXIO);
        g_topology_unlock();
        if (!g_raid3_use_malloc) {
                uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
                uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
                uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
        }
        mtx_destroy(&sc->sc_queue_mtx);
        mtx_destroy(&sc->sc_events_mtx);
        sx_xunlock(&sc->sc_lock);
        sx_destroy(&sc->sc_lock);
}

static void
g_raid3_orphan(struct g_consumer *cp)
{
        struct g_raid3_disk *disk;

        g_topology_assert();

        disk = cp->private;
        if (disk == NULL)
                return;
        disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID;
        g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
            G_RAID3_EVENT_DONTWAIT);
}

static int
g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
        struct g_raid3_softc *sc;
        struct g_consumer *cp;
        off_t offset, length;
        u_char *sector;
        int error = 0;

        g_topology_assert_not();
        sc = disk->d_softc;
        sx_assert(&sc->sc_lock, SX_LOCKED);

        cp = disk->d_consumer;
        KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
        KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
        KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
            ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
            cp->acw, cp->ace));
        length = cp->provider->sectorsize;
        offset = cp->provider->mediasize - length;
        sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
        if (md != NULL)
                raid3_metadata_encode(md, sector);
        error = g_write_data(cp, offset, sector, length);
        free(sector, M_RAID3);
        if (error != 0) {
                if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
                        G_RAID3_DEBUG(0, "Cannot write metadata on %s "
                            "(device=%s, error=%d).",
                            g_raid3_get_diskname(disk), sc->sc_name, error);
                        disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
                } else {
                        G_RAID3_DEBUG(1, "Cannot write metadata on %s "
                            "(device=%s, error=%d).",
                            g_raid3_get_diskname(disk), sc->sc_name, error);
                }
                if (g_raid3_disconnect_on_failure &&
                    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
                        sc->sc_bump_id |= G_RAID3_BUMP_GENID;
                        g_raid3_event_send(disk,
                            G_RAID3_DISK_STATE_DISCONNECTED,
                            G_RAID3_EVENT_DONTWAIT);
                }
        }
        return (error);
}

int
g_raid3_clear_metadata(struct g_raid3_disk *disk)
{
        int error;

        g_topology_assert_not();
        sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);

        error = g_raid3_write_metadata(disk, NULL);
        if (error == 0) {
                G_RAID3_DEBUG(2, "Metadata on %s cleared.",
                    g_raid3_get_diskname(disk));
        } else {
                G_RAID3_DEBUG(0,
                    "Cannot clear metadata on disk %s (error=%d).",
                    g_raid3_get_diskname(disk), error);
        }
        return (error);
}

void
g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
        struct g_raid3_softc *sc;
        struct g_provider *pp;

        sc = disk->d_softc;
        strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
        md->md_version = G_RAID3_VERSION;
        strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
        md->md_id = sc->sc_id;
        md->md_all = sc->sc_ndisks;
        md->md_genid = sc->sc_genid;
        md->md_mediasize = sc->sc_mediasize;
        md->md_sectorsize = sc->sc_sectorsize;
        md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
        md->md_no = disk->d_no;
        md->md_syncid = disk->d_sync.ds_syncid;
        md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
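        /*
         * ds_offset_done appears to be kept in provider (device) offset
         * space (compare g_raid3_sync_collision()), while the on-disk
         * metadata stores a per-component offset, hence the division by
         * the number of data disks.
         */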
        if (disk->d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
                md->md_sync_offset = 0;
        else {
                md->md_sync_offset =
                    disk->d_sync.ds_offset_done / (sc->sc_ndisks - 1);
        }
        if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL)
                pp = disk->d_consumer->provider;
        else
                pp = NULL;
        if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL)
                strlcpy(md->md_provider, pp->name, sizeof(md->md_provider));
        else
                bzero(md->md_provider, sizeof(md->md_provider));
        if (pp != NULL)
                md->md_provsize = pp->mediasize;
        else
                md->md_provsize = 0;
}

void
g_raid3_update_metadata(struct g_raid3_disk *disk)
{
        struct g_raid3_softc *sc;
        struct g_raid3_metadata md;
        int error;

        g_topology_assert_not();
        sc = disk->d_softc;
        sx_assert(&sc->sc_lock, SX_LOCKED);

        g_raid3_fill_metadata(disk, &md);
        error = g_raid3_write_metadata(disk, &md);
        if (error == 0) {
                G_RAID3_DEBUG(2, "Metadata on %s updated.",
                    g_raid3_get_diskname(disk));
        } else {
                G_RAID3_DEBUG(0,
                    "Cannot update metadata on disk %s (error=%d).",
                    g_raid3_get_diskname(disk), error);
        }
}

static void
g_raid3_bump_syncid(struct g_raid3_softc *sc)
{
        struct g_raid3_disk *disk;
        u_int n;

        g_topology_assert_not();
        sx_assert(&sc->sc_lock, SX_XLOCKED);
        KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
            ("%s called with no active disks (device=%s).", __func__,
            sc->sc_name));

        sc->sc_syncid++;
        G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
            sc->sc_syncid);
        for (n = 0; n < sc->sc_ndisks; n++) {
                disk = &sc->sc_disks[n];
                if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
                    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
                        disk->d_sync.ds_syncid = sc->sc_syncid;
                        g_raid3_update_metadata(disk);
                }
        }
}

static void
g_raid3_bump_genid(struct g_raid3_softc *sc)
{
        struct g_raid3_disk *disk;
        u_int n;

        g_topology_assert_not();
        sx_assert(&sc->sc_lock, SX_XLOCKED);
        KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
            ("%s called with no active disks (device=%s).", __func__,
            sc->sc_name));

        sc->sc_genid++;
        G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
            sc->sc_genid);
        for (n = 0; n < sc->sc_ndisks; n++) {
                disk = &sc->sc_disks[n];
                if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
                    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
                        disk->d_genid = sc->sc_genid;
                        g_raid3_update_metadata(disk);
                }
        }
}

static int
g_raid3_idle(struct g_raid3_softc *sc, int acw)
{
        struct g_raid3_disk *disk;
        u_int i;
        int timeout;

        g_topology_assert_not();
        sx_assert(&sc->sc_lock, SX_XLOCKED);

        if (sc->sc_provider == NULL)
                return (0);
        if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
                return (0);
        if (sc->sc_idle)
                return (0);
        if (sc->sc_writes > 0)
                return (0);
        if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
                timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write);
                if (timeout > 0)
                        return (timeout);
        }
        sc->sc_idle = 1;
        for (i = 0; i < sc->sc_ndisks; i++) {
                disk = &sc->sc_disks[i];
                if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
                        continue;
                G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
                    g_raid3_get_diskname(disk), sc->sc_name);
                disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
                g_raid3_update_metadata(disk);
        }
        return (0);
}

static void
g_raid3_unidle(struct g_raid3_softc *sc)
{
        struct g_raid3_disk *disk;
        u_int i;

        g_topology_assert_not();
        sx_assert(&sc->sc_lock, SX_XLOCKED);

        if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
                return;
        sc->sc_idle = 0;
        sc->sc_last_write = time_uptime;
        for (i = 0; i < sc->sc_ndisks; i++) {
                disk = &sc->sc_disks[i];
                if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
                        continue;
                G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
                    g_raid3_get_diskname(disk), sc->sc_name);
                disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
                g_raid3_update_metadata(disk);
        }
}

/*
 * Treat the bio_driver1 field in the parent bio as the list head and the
 * bio_caller1 field in each child bio as a pointer to the next element on
 * the list.
 */
#define G_RAID3_HEAD_BIO(pbp)   (pbp)->bio_driver1

#define G_RAID3_NEXT_BIO(cbp)   (cbp)->bio_caller1

#define G_RAID3_FOREACH_BIO(pbp, bp)                                    \
        for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;                \
            (bp) = G_RAID3_NEXT_BIO(bp))

#define G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)                        \
        for ((bp) = G_RAID3_HEAD_BIO(pbp);                              \
            (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);        \
            (bp) = (tmpbp))
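
/*
 * Usage sketch: walk a parent's children, safely against removal, e.g.:
 *
 *      G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
 *              if (cbp->bio_error != 0)
 *                      g_raid3_destroy_bio(sc, cbp);
 *      }
 */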

static void
g_raid3_init_bio(struct bio *pbp)
{

        G_RAID3_HEAD_BIO(pbp) = NULL;
}

static void
g_raid3_remove_bio(struct bio *cbp)
{
        struct bio *pbp, *bp;

        pbp = cbp->bio_parent;
        if (G_RAID3_HEAD_BIO(pbp) == cbp)
                G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
        else {
                G_RAID3_FOREACH_BIO(pbp, bp) {
                        if (G_RAID3_NEXT_BIO(bp) == cbp) {
                                G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
                                break;
                        }
                }
        }
        G_RAID3_NEXT_BIO(cbp) = NULL;
}

static void
g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
{
        struct bio *pbp, *bp;

        g_raid3_remove_bio(sbp);
        pbp = dbp->bio_parent;
        G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
        if (G_RAID3_HEAD_BIO(pbp) == dbp)
                G_RAID3_HEAD_BIO(pbp) = sbp;
        else {
                G_RAID3_FOREACH_BIO(pbp, bp) {
                        if (G_RAID3_NEXT_BIO(bp) == dbp) {
                                G_RAID3_NEXT_BIO(bp) = sbp;
                                break;
                        }
                }
        }
        G_RAID3_NEXT_BIO(dbp) = NULL;
}

static void
g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
{
        struct bio *bp, *pbp;
        size_t size;

        pbp = cbp->bio_parent;
        pbp->bio_children--;
        KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
        size = pbp->bio_length / (sc->sc_ndisks - 1);
        g_raid3_free(sc, cbp->bio_data, size);
        if (G_RAID3_HEAD_BIO(pbp) == cbp) {
                G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
                G_RAID3_NEXT_BIO(cbp) = NULL;
                g_destroy_bio(cbp);
        } else {
                G_RAID3_FOREACH_BIO(pbp, bp) {
                        if (G_RAID3_NEXT_BIO(bp) == cbp)
                                break;
                }
                if (bp != NULL) {
                        KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
                            ("NULL bp->bio_driver1"));
                        G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
                        G_RAID3_NEXT_BIO(cbp) = NULL;
                }
                g_destroy_bio(cbp);
        }
}

static struct bio *
g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
{
        struct bio *bp, *cbp;
        size_t size;
        int memflag;

        cbp = g_clone_bio(pbp);
        if (cbp == NULL)
                return (NULL);
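        /*
         * Each child bio carries 1/(ndisks - 1) of the parent's payload:
         * the device's data is striped evenly over the data components.
         */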
        size = pbp->bio_length / (sc->sc_ndisks - 1);
        if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
                memflag = M_WAITOK;
        else
                memflag = M_NOWAIT;
        cbp->bio_data = g_raid3_alloc(sc, size, memflag);
        if (cbp->bio_data == NULL) {
                pbp->bio_children--;
                g_destroy_bio(cbp);
                return (NULL);
        }
        G_RAID3_NEXT_BIO(cbp) = NULL;
        if (G_RAID3_HEAD_BIO(pbp) == NULL)
                G_RAID3_HEAD_BIO(pbp) = cbp;
        else {
                G_RAID3_FOREACH_BIO(pbp, bp) {
                        if (G_RAID3_NEXT_BIO(bp) == NULL) {
                                G_RAID3_NEXT_BIO(bp) = cbp;
                                break;
                        }
                }
        }
        return (cbp);
}

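/*
 * Split a regular write across the data components: the parent's data is
 * interleaved, one atom (sectorsize / (ndisks - 1) bytes) per component per
 * device sector, and the parity child is then computed as the XOR of all
 * data children.  With G_RAID3_BIO_PFLAG_NOPARITY set, the parity round is
 * skipped.
 */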
static void
g_raid3_scatter(struct bio *pbp)
{
        struct g_raid3_softc *sc;
        struct g_raid3_disk *disk;
        struct bio *bp, *cbp, *tmpbp;
        off_t atom, cadd, padd, left;
        int first;

        sc = pbp->bio_to->geom->softc;
        bp = NULL;
        if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
                /*
                 * Find the bio for which we should calculate the parity
                 * data.
                 */
                G_RAID3_FOREACH_BIO(pbp, cbp) {
                        if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
                                bp = cbp;
                                break;
                        }
                }
                KASSERT(bp != NULL, ("NULL parity bio."));
        }
        atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
        cadd = padd = 0;
        for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
                G_RAID3_FOREACH_BIO(pbp, cbp) {
                        if (cbp == bp)
                                continue;
                        bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
                        padd += atom;
                }
                cadd += atom;
        }
        if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
                /*
                 * Calculate parity.
                 */
                first = 1;
                G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
                        if (cbp == bp)
                                continue;
                        if (first) {
                                bcopy(cbp->bio_data, bp->bio_data,
                                    bp->bio_length);
                                first = 0;
                        } else {
                                g_raid3_xor(cbp->bio_data, bp->bio_data,
                                    bp->bio_length);
                        }
                        if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
                                g_raid3_destroy_bio(sc, cbp);
                }
        }
        G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
                struct g_consumer *cp;

                disk = cbp->bio_caller2;
                cp = disk->d_consumer;
                cbp->bio_to = cp->provider;
                G_RAID3_LOGREQ(3, cbp, "Sending request.");
                KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
                    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
                    cp->acr, cp->acw, cp->ace));
                cp->index++;
                sc->sc_writes++;
                g_io_request(cbp, cp);
        }
}

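/*
 * Complete a regular read: copy the data columns back into the parent's
 * buffer.  A missing or failed column is reconstructed by XORing the parity
 * column with all surviving data columns; in VERIFY mode the parity is
 * instead recomputed from the data and must match the parity read from disk
 * (checked via g_raid3_is_zero() after the XOR pass).
 */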
static void
g_raid3_gather(struct bio *pbp)
{
        struct g_raid3_softc *sc;
        struct g_raid3_disk *disk;
        struct bio *xbp, *fbp, *cbp;
        off_t atom, cadd, padd, left;

        sc = pbp->bio_to->geom->softc;
        /*
         * Find the bio for which we have to calculate data.  While going
         * through this path, check whether all requests succeeded and, if
         * not, deny the whole request.  If we're in COMPLETE mode, we allow
         * one request to fail, so if we find one, we resend it to the
         * parity consumer.  If more than one request has failed, we deny
         * the whole request.
         */
        xbp = fbp = NULL;
        G_RAID3_FOREACH_BIO(pbp, cbp) {
                if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
                        KASSERT(xbp == NULL, ("More than one parity bio."));
                        xbp = cbp;
                }
                if (cbp->bio_error == 0)
                        continue;
                /*
                 * Found failed request.
                 */
                if (fbp == NULL) {
                        if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
                                /*
                                 * We are already in degraded mode, so we can't
                                 * accept any failures.
                                 */
                                if (pbp->bio_error == 0)
                                        pbp->bio_error = cbp->bio_error;
                        } else {
                                fbp = cbp;
                        }
                } else {
                        /*
                         * Next failed request, that's too many.
                         */
                        if (pbp->bio_error == 0)
                                pbp->bio_error = fbp->bio_error;
                }
                disk = cbp->bio_caller2;
                if (disk == NULL)
                        continue;
                if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
                        disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
                        G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).",
                            cbp->bio_error);
                } else {
                        G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).",
                            cbp->bio_error);
                }
                if (g_raid3_disconnect_on_failure &&
                    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
                        sc->sc_bump_id |= G_RAID3_BUMP_GENID;
                        g_raid3_event_send(disk,
                            G_RAID3_DISK_STATE_DISCONNECTED,
                            G_RAID3_EVENT_DONTWAIT);
                }
        }
        if (pbp->bio_error != 0)
                goto finish;
        if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
                pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
                if (xbp != fbp)
                        g_raid3_replace_bio(xbp, fbp);
                g_raid3_destroy_bio(sc, fbp);
        } else if (fbp != NULL) {
                struct g_consumer *cp;

                /*
                 * One request failed, so send the same request to
                 * the parity consumer.
                 */
                disk = pbp->bio_driver2;
                if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
                        pbp->bio_error = fbp->bio_error;
                        goto finish;
                }
                pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
                pbp->bio_inbed--;
                fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
                if (disk->d_no == sc->sc_ndisks - 1)
                        fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
                fbp->bio_error = 0;
                fbp->bio_completed = 0;
                fbp->bio_children = 0;
                fbp->bio_inbed = 0;
                cp = disk->d_consumer;
                fbp->bio_caller2 = disk;
                fbp->bio_to = cp->provider;
                G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
                KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
                    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
                    cp->acr, cp->acw, cp->ace));
                cp->index++;
                g_io_request(fbp, cp);
                return;
        }
        if (xbp != NULL) {
                /*
                 * Calculate parity.
                 */
                G_RAID3_FOREACH_BIO(pbp, cbp) {
                        if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
                                continue;
                        g_raid3_xor(cbp->bio_data, xbp->bio_data,
                            xbp->bio_length);
                }
                xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
                if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
                        if (!g_raid3_is_zero(xbp)) {
                                g_raid3_parity_mismatch++;
                                pbp->bio_error = EIO;
                                goto finish;
                        }
                        g_raid3_destroy_bio(sc, xbp);
                }
        }
        atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
        cadd = padd = 0;
        for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
                G_RAID3_FOREACH_BIO(pbp, cbp) {
                        bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
                        pbp->bio_completed += atom;
                        padd += atom;
                }
                cadd += atom;
        }
finish:
        if (pbp->bio_error == 0)
                G_RAID3_LOGREQ(3, pbp, "Request finished.");
        else {
                if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
                        G_RAID3_LOGREQ(1, pbp, "Verification error.");
                else
                        G_RAID3_LOGREQ(0, pbp, "Request failed.");
        }
        pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
        while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
                g_raid3_destroy_bio(sc, cbp);
        g_io_deliver(pbp, pbp->bio_error);
}

static void
g_raid3_done(struct bio *bp)
{
        struct g_raid3_softc *sc;

        sc = bp->bio_from->geom->softc;
        bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
        G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
        mtx_lock(&sc->sc_queue_mtx);
        bioq_insert_head(&sc->sc_queue, bp);
        mtx_unlock(&sc->sc_queue_mtx);
        wakeup(sc);
        wakeup(&sc->sc_queue);
}

static void
g_raid3_regular_request(struct bio *cbp)
{
        struct g_raid3_softc *sc;
        struct g_raid3_disk *disk;
        struct bio *pbp;

        g_topology_assert_not();

        pbp = cbp->bio_parent;
        sc = pbp->bio_to->geom->softc;
        cbp->bio_from->index--;
        if (cbp->bio_cmd == BIO_WRITE)
                sc->sc_writes--;
        disk = cbp->bio_from->private;
        if (disk == NULL) {
                g_topology_lock();
                g_raid3_kill_consumer(sc, cbp->bio_from);
                g_topology_unlock();
        }

        G_RAID3_LOGREQ(3, cbp, "Request finished.");
        pbp->bio_inbed++;
        KASSERT(pbp->bio_inbed <= pbp->bio_children,
            ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
            pbp->bio_children));
        if (pbp->bio_inbed != pbp->bio_children)
                return;
        switch (pbp->bio_cmd) {
        case BIO_READ:
                g_raid3_gather(pbp);
                break;
        case BIO_WRITE:
        case BIO_DELETE:
            {
                int error = 0;

                pbp->bio_completed = pbp->bio_length;
                while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
                        if (cbp->bio_error == 0) {
                                g_raid3_destroy_bio(sc, cbp);
                                continue;
                        }

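                        /*
                         * Remember the first failure: it only degrades the
                         * request, so the parent can still complete.  A
                         * second failure (below) fails the whole request.
                         */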
                        if (error == 0)
                                error = cbp->bio_error;
                        else if (pbp->bio_error == 0) {
                                /*
                                 * Next failed request, that's too many.
                                 */
                                pbp->bio_error = error;
                        }

                        disk = cbp->bio_caller2;
                        if (disk == NULL) {
                                g_raid3_destroy_bio(sc, cbp);
                                continue;
                        }

                        if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
                                disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
                                G_RAID3_LOGREQ(0, cbp,
                                    "Request failed (error=%d).",
                                    cbp->bio_error);
                        } else {
                                G_RAID3_LOGREQ(1, cbp,
                                    "Request failed (error=%d).",
                                    cbp->bio_error);
                        }
                        if (g_raid3_disconnect_on_failure &&
                            sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
                                sc->sc_bump_id |= G_RAID3_BUMP_GENID;
                                g_raid3_event_send(disk,
                                    G_RAID3_DISK_STATE_DISCONNECTED,
                                    G_RAID3_EVENT_DONTWAIT);
                        }
                        g_raid3_destroy_bio(sc, cbp);
                }
                if (pbp->bio_error == 0)
                        G_RAID3_LOGREQ(3, pbp, "Request finished.");
                else
                        G_RAID3_LOGREQ(0, pbp, "Request failed.");
                pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
                pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
                bioq_remove(&sc->sc_inflight, pbp);
                /* Release delayed sync requests if possible. */
                g_raid3_sync_release(sc);
                g_io_deliver(pbp, pbp->bio_error);
                break;
            }
        }
}

static void
g_raid3_sync_done(struct bio *bp)
{
        struct g_raid3_softc *sc;

        G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
        sc = bp->bio_from->geom->softc;
        bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
        mtx_lock(&sc->sc_queue_mtx);
        bioq_insert_head(&sc->sc_queue, bp);
        mtx_unlock(&sc->sc_queue_mtx);
        wakeup(sc);
        wakeup(&sc->sc_queue);
}

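/*
 * Fan a BIO_FLUSH out to every ACTIVE component.  All clones are queued
 * locally first so that a clone allocation failure can be unwound (all
 * queued clones destroyed, ENOMEM delivered) before anything has been sent.
 */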
static void
g_raid3_flush(struct g_raid3_softc *sc, struct bio *bp)
{
        struct bio_queue_head queue;
        struct g_raid3_disk *disk;
        struct g_consumer *cp;
        struct bio *cbp;
        u_int i;

        bioq_init(&queue);
        for (i = 0; i < sc->sc_ndisks; i++) {
                disk = &sc->sc_disks[i];
                if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
                        continue;
                cbp = g_clone_bio(bp);
                if (cbp == NULL) {
                        for (cbp = bioq_first(&queue); cbp != NULL;
                            cbp = bioq_first(&queue)) {
                                bioq_remove(&queue, cbp);
                                g_destroy_bio(cbp);
                        }
                        if (bp->bio_error == 0)
                                bp->bio_error = ENOMEM;
                        g_io_deliver(bp, bp->bio_error);
                        return;
                }
                bioq_insert_tail(&queue, cbp);
                cbp->bio_done = g_std_done;
                cbp->bio_caller1 = disk;
                cbp->bio_to = disk->d_consumer->provider;
        }
        for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) {
                bioq_remove(&queue, cbp);
                G_RAID3_LOGREQ(3, cbp, "Sending request.");
                disk = cbp->bio_caller1;
                cbp->bio_caller1 = NULL;
                cp = disk->d_consumer;
                KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
                    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
                    cp->acr, cp->acw, cp->ace));
                g_io_request(cbp, disk->d_consumer);
        }
}

static void
g_raid3_start(struct bio *bp)
{
        struct g_raid3_softc *sc;

        sc = bp->bio_to->geom->softc;
        /*
         * If sc == NULL or there are no valid disks, the provider's error
         * should already be set and g_raid3_start() should not be called at
         * all.
         */
        KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
            sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
            ("Provider's error should be set (error=%d)(device=%s).",
            bp->bio_to->error, bp->bio_to->name));
        G_RAID3_LOGREQ(3, bp, "Request received.");

        switch (bp->bio_cmd) {
        case BIO_READ:
        case BIO_WRITE:
        case BIO_DELETE:
                break;
        case BIO_FLUSH:
                g_raid3_flush(sc, bp);
                return;
        case BIO_GETATTR:
        default:
                g_io_deliver(bp, EOPNOTSUPP);
                return;
        }
        mtx_lock(&sc->sc_queue_mtx);
        bioq_insert_tail(&sc->sc_queue, bp);
        mtx_unlock(&sc->sc_queue_mtx);
        G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
        wakeup(sc);
}

1469 /*
1470  * Return TRUE if the given request collides with an in-progress
1471  * synchronization request.
1472  */
1473 static int
1474 g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp)
1475 {
1476         struct g_raid3_disk *disk;
1477         struct bio *sbp;
1478         off_t rstart, rend, sstart, send;
1479         int i;
1480
1481         disk = sc->sc_syncdisk;
1482         if (disk == NULL)
1483                 return (0);
1484         rstart = bp->bio_offset;
1485         rend = bp->bio_offset + bp->bio_length;
1486         for (i = 0; i < g_raid3_syncreqs; i++) {
1487                 sbp = disk->d_sync.ds_bios[i];
1488                 if (sbp == NULL)
1489                         continue;
1490                 sstart = sbp->bio_offset;
1491                 send = sbp->bio_length;
1492                 if (sbp->bio_cmd == BIO_WRITE) {
1493                         sstart *= sc->sc_ndisks - 1;
1494                         send *= sc->sc_ndisks - 1;
1495                 }
1496                 send += sstart;
1497                 if (rend > sstart && rstart < send)
1498                         return (1);
1499         }
1500         return (0);
1501 }
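
/*
 * The test above is the standard half-open interval intersection:
 * [rstart, rend) and [sstart, send) overlap iff rend > sstart and
 * rstart < send.  Sync bios carry component-local offsets, so WRITEs
 * are scaled by (sc_ndisks - 1) into provider address space first.
 * A worked example (numbers assumed for illustration): with 3 disks,
 * a sync WRITE at component offset 1024 with length 512 covers the
 * provider range [2048, 3072); a regular request at offset 2560 with
 * length 256 covers [2560, 2816), and since 2816 > 2048 and
 * 2560 < 3072, the two collide and the regular request is delayed.
 */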
1502
1503 /*
1504  * Return TRUE if the given sync request collides with an in-progress
1505  * regular request.
1506  */
1507 static int
1508 g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp)
1509 {
1510         off_t rstart, rend, sstart, send;
1511         struct bio *bp;
1512
1513         if (sc->sc_syncdisk == NULL)
1514                 return (0);
1515         sstart = sbp->bio_offset;
1516         send = sstart + sbp->bio_length;
1517         TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) {
1518                 rstart = bp->bio_offset;
1519                 rend = bp->bio_offset + bp->bio_length;
1520                 if (rend > sstart && rstart < send)
1521                         return (1);
1522         }
1523         return (0);
1524 }
1525
1526 /*
1527  * Puts a regular request onto the delayed queue.
1528  */
1529 static void
1530 g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp)
1531 {
1532
1533         G_RAID3_LOGREQ(2, bp, "Delaying request.");
1534         bioq_insert_head(&sc->sc_regular_delayed, bp);
1535 }
1536
1537 /*
1538  * Puts a synchronization request onto the delayed queue.
1539  */
1540 static void
1541 g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp)
1542 {
1543
1544         G_RAID3_LOGREQ(2, bp, "Delaying synchronization request.");
1545         bioq_insert_tail(&sc->sc_sync_delayed, bp);
1546 }
1547
1548 /*
1549  * Releases delayed regular requests which no longer collide with sync
1550  * requests.
1551  */
1552 static void
1553 g_raid3_regular_release(struct g_raid3_softc *sc)
1554 {
1555         struct bio *bp, *bp2;
1556
1557         TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) {
1558                 if (g_raid3_sync_collision(sc, bp))
1559                         continue;
1560                 bioq_remove(&sc->sc_regular_delayed, bp);
1561                 G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp);
1562                 mtx_lock(&sc->sc_queue_mtx);
1563                 bioq_insert_head(&sc->sc_queue, bp);
1564 #if 0
1565                 /*
1566                  * wakeup() is not needed, because this function is called from
1567                  * the worker thread.
1568                  */
1569                 wakeup(&sc->sc_queue);
1570 #endif
1571                 mtx_unlock(&sc->sc_queue_mtx);
1572         }
1573 }
1574
1575 /*
1576  * Releases delayed sync requests which no longer collide with regular
1577  * requests.
1578  */
1579 static void
1580 g_raid3_sync_release(struct g_raid3_softc *sc)
1581 {
1582         struct bio *bp, *bp2;
1583
1584         TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) {
1585                 if (g_raid3_regular_collision(sc, bp))
1586                         continue;
1587                 bioq_remove(&sc->sc_sync_delayed, bp);
1588                 G_RAID3_LOGREQ(2, bp,
1589                     "Releasing delayed synchronization request.");
1590                 g_io_request(bp, bp->bio_from);
1591         }
1592 }
1593
1594 /*
1595  * Handle synchronization requests.
1596  * Every synchronization request is a two-step process: first, a READ request
1597  * is sent to the active provider, then a WRITE request (with the read data) is
1598  * sent to the provider being synchronized. When the WRITE is finished, a new
1599  * synchronization request is sent.
1600  */
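/*
 * In sketch form, assuming a fresh rebuild (the READ goes through the
 * raid3 provider itself, so it returns fully reconstructed data):
 *
 *	READ:  bio_offset = ds_offset * (sc_ndisks - 1),
 *	       bio_length = min(MAXPHYS, what is left)
 *	on READ completion: extract this component's share of the data
 *	       (or recompute parity by XOR) and convert the bio to a
 *	WRITE: bio_offset /= (sc_ndisks - 1),
 *	       bio_length /= (sc_ndisks - 1), sent to the sync disk
 *	on WRITE completion: advance ds_offset and issue the next READ.
 */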
1601 static void
1602 g_raid3_sync_request(struct bio *bp)
1603 {
1604         struct g_raid3_softc *sc;
1605         struct g_raid3_disk *disk;
1606
1607         bp->bio_from->index--;
1608         sc = bp->bio_from->geom->softc;
1609         disk = bp->bio_from->private;
1610         if (disk == NULL) {
1611                 sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
1612                 g_topology_lock();
1613                 g_raid3_kill_consumer(sc, bp->bio_from);
1614                 g_topology_unlock();
1615                 free(bp->bio_data, M_RAID3);
1616                 g_destroy_bio(bp);
1617                 sx_xlock(&sc->sc_lock);
1618                 return;
1619         }
1620
1621         /*
1622          * Synchronization request.
1623          */
1624         switch (bp->bio_cmd) {
1625         case BIO_READ:
1626             {
1627                 struct g_consumer *cp;
1628                 u_char *dst, *src;
1629                 off_t left;
1630                 u_int atom;
1631
1632                 if (bp->bio_error != 0) {
1633                         G_RAID3_LOGREQ(0, bp,
1634                             "Synchronization request failed (error=%d).",
1635                             bp->bio_error);
1636                         g_destroy_bio(bp);
1637                         return;
1638                 }
1639                 G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
1640                 atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1641                 dst = src = bp->bio_data;
1642                 if (disk->d_no == sc->sc_ndisks - 1) {
1643                         u_int n;
1644
1645                         /* Parity component. */
1646                         for (left = bp->bio_length; left > 0;
1647                             left -= sc->sc_sectorsize) {
1648                                 bcopy(src, dst, atom);
1649                                 src += atom;
1650                                 for (n = 1; n < sc->sc_ndisks - 1; n++) {
1651                                         g_raid3_xor(src, dst, atom);
1652                                         src += atom;
1653                                 }
1654                                 dst += atom;
1655                         }
1656                 } else {
1657                         /* Regular component. */
1658                         src += atom * disk->d_no;
1659                         for (left = bp->bio_length; left > 0;
1660                             left -= sc->sc_sectorsize) {
1661                                 bcopy(src, dst, atom);
1662                                 src += sc->sc_sectorsize;
1663                                 dst += atom;
1664                         }
1665                 }
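                /*
                 * Layout recap, with numbers assumed for illustration:
                 * for sc_ndisks = 3 and a 512-byte provider sector,
                 * atom is 256; bytes [0, 256) of every sector live on
                 * component 0, bytes [256, 512) on component 1, and
                 * component 2 stores their XOR.  The loops above either
                 * pick this component's atom out of each reconstructed
                 * sector or, for the parity component, XOR the
                 * (sc_ndisks - 1) data atoms together.
                 */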
1666                 bp->bio_driver1 = bp->bio_driver2 = NULL;
1667                 bp->bio_pflags = 0;
1668                 bp->bio_offset /= sc->sc_ndisks - 1;
1669                 bp->bio_length /= sc->sc_ndisks - 1;
1670                 bp->bio_cmd = BIO_WRITE;
1671                 bp->bio_cflags = 0;
1672                 bp->bio_children = bp->bio_inbed = 0;
1673                 cp = disk->d_consumer;
1674                 KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
1675                     ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1676                     cp->acr, cp->acw, cp->ace));
1677                 cp->index++;
1678                 g_io_request(bp, cp);
1679                 return;
1680             }
1681         case BIO_WRITE:
1682             {
1683                 struct g_raid3_disk_sync *sync;
1684                 off_t boffset, moffset;
1685                 void *data;
1686                 int i;
1687
1688                 if (bp->bio_error != 0) {
1689                         G_RAID3_LOGREQ(0, bp,
1690                             "Synchronization request failed (error=%d).",
1691                             bp->bio_error);
1692                         g_destroy_bio(bp);
1693                         sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1694                         g_raid3_event_send(disk,
1695                             G_RAID3_DISK_STATE_DISCONNECTED,
1696                             G_RAID3_EVENT_DONTWAIT);
1697                         return;
1698                 }
1699                 G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
1700                 sync = &disk->d_sync;
1701                 if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) ||
1702                     sync->ds_consumer == NULL ||
1703                     (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1704                         /* Don't send more synchronization requests. */
1705                         sync->ds_inflight--;
1706                         if (sync->ds_bios != NULL) {
1707                                 i = (int)(uintptr_t)bp->bio_caller1;
1708                                 sync->ds_bios[i] = NULL;
1709                         }
1710                         free(bp->bio_data, M_RAID3);
1711                         g_destroy_bio(bp);
1712                         if (sync->ds_inflight > 0)
1713                                 return;
1714                         if (sync->ds_consumer == NULL ||
1715                             (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1716                                 return;
1717                         }
1718                         /*
1719                          * Disk up-to-date, activate it.
1720                          */
1721                         g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
1722                             G_RAID3_EVENT_DONTWAIT);
1723                         return;
1724                 }
1725
1726                 /* Send next synchronization request. */
1727                 data = bp->bio_data;
1728                 bzero(bp, sizeof(*bp));
1729                 bp->bio_cmd = BIO_READ;
1730                 bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1);
1731                 bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
1732                 sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
1733                 bp->bio_done = g_raid3_sync_done;
1734                 bp->bio_data = data;
1735                 bp->bio_from = sync->ds_consumer;
1736                 bp->bio_to = sc->sc_provider;
1737                 G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
1738                 sync->ds_consumer->index++;
1739                 /*
1740                  * Delay the request if it collides with a regular request.
1741                  */
1742                 if (g_raid3_regular_collision(sc, bp))
1743                         g_raid3_sync_delay(sc, bp);
1744                 else
1745                         g_io_request(bp, sync->ds_consumer);
1746
1747                 /* Release delayed requests if possible. */
1748                 g_raid3_regular_release(sc);
1749
1750                 /* Find the smallest offset. */
1751                 moffset = sc->sc_mediasize;
1752                 for (i = 0; i < g_raid3_syncreqs; i++) {
1753                         bp = sync->ds_bios[i];
1754                         boffset = bp->bio_offset;
1755                         if (bp->bio_cmd == BIO_WRITE)
1756                                 boffset *= sc->sc_ndisks - 1;
1757                         if (boffset < moffset)
1758                                 moffset = boffset;
1759                 }
1760                 if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) {
1761                         /* Update offset_done every 100 * MAXPHYS bytes. */
1762                         sync->ds_offset_done = moffset;
1763                         g_raid3_update_metadata(disk);
1764                 }
1765                 return;
1766             }
1767         default:
1768                 KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
1769                     bp->bio_cmd, sc->sc_name));
1770                 break;
1771         }
1772 }
1773
1774 static int
1775 g_raid3_register_request(struct bio *pbp)
1776 {
1777         struct g_raid3_softc *sc;
1778         struct g_raid3_disk *disk;
1779         struct g_consumer *cp;
1780         struct bio *cbp, *tmpbp;
1781         off_t offset, length;
1782         u_int n, ndisks;
1783         int round_robin, verify;
1784
1785         ndisks = 0;
1786         sc = pbp->bio_to->geom->softc;
1787         if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
1788             sc->sc_syncdisk == NULL) {
1789                 g_io_deliver(pbp, EIO);
1790                 return (0);
1791         }
1792         g_raid3_init_bio(pbp);
1793         length = pbp->bio_length / (sc->sc_ndisks - 1);
1794         offset = pbp->bio_offset / (sc->sc_ndisks - 1);
1795         round_robin = verify = 0;
1796         switch (pbp->bio_cmd) {
1797         case BIO_READ:
1798                 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
1799                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1800                         pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
1801                         verify = 1;
1802                         ndisks = sc->sc_ndisks;
1803                 } else {
1804                         verify = 0;
1805                         ndisks = sc->sc_ndisks - 1;
1806                 }
1807                 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
1808                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1809                         round_robin = 1;
1810                 } else {
1811                         round_robin = 0;
1812                 }
1813                 KASSERT(!round_robin || !verify,
1814                     ("ROUND-ROBIN and VERIFY are mutually exclusive."));
1815                 pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
1816                 break;
1817         case BIO_WRITE:
1818         case BIO_DELETE:
1819                 /*
1820                  * Delay the request if it collides with a synchronization
1821                  * request.
1822                  */
1823                 if (g_raid3_sync_collision(sc, pbp)) {
1824                         g_raid3_regular_delay(sc, pbp);
1825                         return (0);
1826                 }
1827
1828                 if (sc->sc_idle)
1829                         g_raid3_unidle(sc);
1830                 else
1831                         sc->sc_last_write = time_uptime;
1832
1833                 ndisks = sc->sc_ndisks;
1834                 break;
1835         }
1836         for (n = 0; n < ndisks; n++) {
1837                 disk = &sc->sc_disks[n];
1838                 cbp = g_raid3_clone_bio(sc, pbp);
1839                 if (cbp == NULL) {
1840                         while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1841                                 g_raid3_destroy_bio(sc, cbp);
1842                         /*
1843                          * To prevent deadlock, we must pass the ENOMEM
1844                          * back up for failed requests from any of our
1845                          * consumers.  Our own sync requests can stick
1846                          * around, as they are finite.
1847                          */
1848                         if ((pbp->bio_cflags &
1849                             G_RAID3_BIO_CFLAG_REGULAR) != 0) {
1850                                 g_io_deliver(pbp, ENOMEM);
1851                                 return (0);
1852                         }
1853                         return (ENOMEM);
1854                 }
1855                 cbp->bio_offset = offset;
1856                 cbp->bio_length = length;
1857                 cbp->bio_done = g_raid3_done;
1858                 switch (pbp->bio_cmd) {
1859                 case BIO_READ:
1860                         if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1861                                 /*
1862                                  * Replace invalid component with the parity
1863                                  * component.
1864                                  */
1865                                 disk = &sc->sc_disks[sc->sc_ndisks - 1];
1866                                 cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1867                                 pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1868                         } else if (round_robin &&
1869                             disk->d_no == sc->sc_round_robin) {
1870                                 /*
1871                                  * In round-robin mode skip one data component
1872                          * and use the parity component when reading.
1873                                  */
1874                                 pbp->bio_driver2 = disk;
1875                                 disk = &sc->sc_disks[sc->sc_ndisks - 1];
1876                                 cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1877                                 sc->sc_round_robin++;
1878                                 round_robin = 0;
1879                         } else if (verify && disk->d_no == sc->sc_ndisks - 1) {
1880                                 cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1881                         }
1882                         break;
1883                 case BIO_WRITE:
1884                 case BIO_DELETE:
1885                         if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
1886                             disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
1887                                 if (n == ndisks - 1) {
1888                                         /*
1889                                          * Active parity component, mark it as such.
1890                                          */
1891                                         cbp->bio_cflags |=
1892                                             G_RAID3_BIO_CFLAG_PARITY;
1893                                 }
1894                         } else {
1895                                 pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1896                                 if (n == ndisks - 1) {
1897                                         /*
1898                                          * Parity component is not connected,
1899                                          * so destroy its request.
1900                                          */
1901                                         pbp->bio_pflags |=
1902                                             G_RAID3_BIO_PFLAG_NOPARITY;
1903                                         g_raid3_destroy_bio(sc, cbp);
1904                                         cbp = NULL;
1905                                 } else {
1906                                         cbp->bio_cflags |=
1907                                             G_RAID3_BIO_CFLAG_NODISK;
1908                                         disk = NULL;
1909                                 }
1910                         }
1911                         break;
1912                 }
1913                 if (cbp != NULL)
1914                         cbp->bio_caller2 = disk;
1915         }
1916         switch (pbp->bio_cmd) {
1917         case BIO_READ:
1918                 if (round_robin) {
1919                         /*
1920                          * If we are in round-robin mode and 'round_robin' is
1921                          * still 1, it means that we skipped the parity component
1922                          * for this read and must reset the sc_round_robin field.
1923                          */
1924                         sc->sc_round_robin = 0;
1925                 }
1926                 G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
1927                         disk = cbp->bio_caller2;
1928                         cp = disk->d_consumer;
1929                         cbp->bio_to = cp->provider;
1930                         G_RAID3_LOGREQ(3, cbp, "Sending request.");
1931                         KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
1932                             ("Consumer %s not opened (r%dw%de%d).",
1933                             cp->provider->name, cp->acr, cp->acw, cp->ace));
1934                         cp->index++;
1935                         g_io_request(cbp, cp);
1936                 }
1937                 break;
1938         case BIO_WRITE:
1939         case BIO_DELETE:
1940                 /*
1941                  * Put the request onto the inflight queue, so we can check
1942                  * whether new synchronization requests collide with it.
1943                  */
1944                 bioq_insert_tail(&sc->sc_inflight, pbp);
1945
1946                 /*
1947                  * Bump syncid on first write.
1948                  */
1949                 if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) {
1950                         sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
1951                         g_raid3_bump_syncid(sc);
1952                 }
1953                 g_raid3_scatter(pbp);
1954                 break;
1955         }
1956         return (0);
1957 }
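
/*
 * The arithmetic above fans a single parent request out over the
 * components: every clone gets the parent's offset and length divided
 * by (sc_ndisks - 1).  A worked example (numbers assumed): a 64 KB READ
 * at provider offset 128 KB on a 3-disk array becomes two 32 KB
 * component reads at offset 64 KB, plus a read of the parity component
 * only in VERIFY mode.  A WRITE always fans out to all sc_ndisks
 * components, with the parity clone marked G_RAID3_BIO_CFLAG_PARITY.
 */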
1958
1959 static int
1960 g_raid3_can_destroy(struct g_raid3_softc *sc)
1961 {
1962         struct g_geom *gp;
1963         struct g_consumer *cp;
1964
1965         g_topology_assert();
1966         gp = sc->sc_geom;
1967         if (gp->softc == NULL)
1968                 return (1);
1969         LIST_FOREACH(cp, &gp->consumer, consumer) {
1970                 if (g_raid3_is_busy(sc, cp))
1971                         return (0);
1972         }
1973         gp = sc->sc_sync.ds_geom;
1974         LIST_FOREACH(cp, &gp->consumer, consumer) {
1975                 if (g_raid3_is_busy(sc, cp))
1976                         return (0);
1977         }
1978         G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
1979             sc->sc_name);
1980         return (1);
1981 }
1982
1983 static int
1984 g_raid3_try_destroy(struct g_raid3_softc *sc)
1985 {
1986
1987         g_topology_assert_not();
1988         sx_assert(&sc->sc_lock, SX_XLOCKED);
1989
1990         if (sc->sc_rootmount != NULL) {
1991                 G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
1992                     sc->sc_rootmount);
1993                 root_mount_rel(sc->sc_rootmount);
1994                 sc->sc_rootmount = NULL;
1995         }
1996
1997         g_topology_lock();
1998         if (!g_raid3_can_destroy(sc)) {
1999                 g_topology_unlock();
2000                 return (0);
2001         }
2002         sc->sc_geom->softc = NULL;
2003         sc->sc_sync.ds_geom->softc = NULL;
2004         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
2005                 g_topology_unlock();
2006                 G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
2007                     &sc->sc_worker);
2008                 /* Unlock sc_lock here, as it can be destroyed after wakeup. */
2009                 sx_xunlock(&sc->sc_lock);
2010                 wakeup(&sc->sc_worker);
2011                 sc->sc_worker = NULL;
2012         } else {
2013                 g_topology_unlock();
2014                 g_raid3_destroy_device(sc);
2015                 free(sc->sc_disks, M_RAID3);
2016                 free(sc, M_RAID3);
2017         }
2018         return (1);
2019 }
2020
2021 /*
2022  * Worker thread.
2023  */
2024 static void
2025 g_raid3_worker(void *arg)
2026 {
2027         struct g_raid3_softc *sc;
2028         struct g_raid3_event *ep;
2029         struct bio *bp;
2030         int timeout;
2031
2032         sc = arg;
2033         thread_lock(curthread);
2034         sched_prio(curthread, PRIBIO);
2035         thread_unlock(curthread);
2036
2037         sx_xlock(&sc->sc_lock);
2038         for (;;) {
2039                 G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
2040                 /*
2041                  * First take a look at events.
2042                  * It is important to handle events before any I/O requests.
2043                  */
2044                 ep = g_raid3_event_get(sc);
2045                 if (ep != NULL) {
2046                         g_raid3_event_remove(sc, ep);
2047                         if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
2048                                 /* Update only device status. */
2049                                 G_RAID3_DEBUG(3,
2050                                     "Running event for device %s.",
2051                                     sc->sc_name);
2052                                 ep->e_error = 0;
2053                                 g_raid3_update_device(sc, 1);
2054                         } else {
2055                                 /* Update disk status. */
2056                                 G_RAID3_DEBUG(3, "Running event for disk %s.",
2057                                      g_raid3_get_diskname(ep->e_disk));
2058                                 ep->e_error = g_raid3_update_disk(ep->e_disk,
2059                                     ep->e_state);
2060                                 if (ep->e_error == 0)
2061                                         g_raid3_update_device(sc, 0);
2062                         }
2063                         if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
2064                                 KASSERT(ep->e_error == 0,
2065                                     ("Error cannot be handled."));
2066                                 g_raid3_event_free(ep);
2067                         } else {
2068                                 ep->e_flags |= G_RAID3_EVENT_DONE;
2069                                 G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
2070                                     ep);
2071                                 mtx_lock(&sc->sc_events_mtx);
2072                                 wakeup(ep);
2073                                 mtx_unlock(&sc->sc_events_mtx);
2074                         }
2075                         if ((sc->sc_flags &
2076                             G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
2077                                 if (g_raid3_try_destroy(sc)) {
2078                                         curthread->td_pflags &= ~TDP_GEOM;
2079                                         G_RAID3_DEBUG(1, "Thread exiting.");
2080                                         kproc_exit(0);
2081                                 }
2082                         }
2083                         G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
2084                         continue;
2085                 }
2086                 /*
2087                  * Check if we can mark the array as CLEAN and, if we
2088                  * cannot, how many seconds we should wait.
2089                  */
2090                 timeout = g_raid3_idle(sc, -1);
2091                 /*
2092                  * Now I/O requests.
2093                  */
2094                 /* Get first request from the queue. */
2095                 mtx_lock(&sc->sc_queue_mtx);
2096                 bp = bioq_first(&sc->sc_queue);
2097                 if (bp == NULL) {
2098                         if ((sc->sc_flags &
2099                             G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
2100                                 mtx_unlock(&sc->sc_queue_mtx);
2101                                 if (g_raid3_try_destroy(sc)) {
2102                                         curthread->td_pflags &= ~TDP_GEOM;
2103                                         G_RAID3_DEBUG(1, "Thread exiting.");
2104                                         kproc_exit(0);
2105                                 }
2106                                 mtx_lock(&sc->sc_queue_mtx);
2107                         }
2108                         sx_xunlock(&sc->sc_lock);
2109                         /*
2110                          * XXX: We can miss an event here, because an event
2111                          *      can be added without sx-device-lock and without
2112                          *      mtx-queue-lock. Maybe I should just stop using a
2113                          *      dedicated mutex for events synchronization and
2114                          *      stick with the queue lock?
2115                          *      The event will hang here until the next I/O
2116                          *      request or the next event is received.
2117                          */
2118                         MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1",
2119                             timeout * hz);
2120                         sx_xlock(&sc->sc_lock);
2121                         G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
2122                         continue;
2123                 }
2124 process:
2125                 bioq_remove(&sc->sc_queue, bp);
2126                 mtx_unlock(&sc->sc_queue_mtx);
2127
2128                 if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
2129                     (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
2130                         g_raid3_sync_request(bp);       /* READ */
2131                 } else if (bp->bio_to != sc->sc_provider) {
2132                         if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
2133                                 g_raid3_regular_request(bp);
2134                         else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0)
2135                                 g_raid3_sync_request(bp);       /* WRITE */
2136                         else {
2137                                 KASSERT(0,
2138                                     ("Invalid request cflags=0x%hhx to=%s.",
2139                                     bp->bio_cflags, bp->bio_to->name));
2140                         }
2141                 } else if (g_raid3_register_request(bp) != 0) {
2142                         mtx_lock(&sc->sc_queue_mtx);
2143                         bioq_insert_head(&sc->sc_queue, bp);
2144                         /*
2145                          * We are short on memory; let's see if there are
2146                          * finished requests we can free.
2147                          */
2148                         TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
2149                                 if (bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR)
2150                                         goto process;
2151                         }
2152                         /*
2153                          * No finished regular requests, so at least keep
2154                          * synchronization running.
2155                          */
2156                         TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
2157                                 if (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC)
2158                                         goto process;
2159                         }
2160                         sx_xunlock(&sc->sc_lock);
2161                         MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, PRIBIO | PDROP,
2162                             "r3:lowmem", hz / 10);
2163                         sx_xlock(&sc->sc_lock);
2164                 }
2165                 G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
2166         }
2167 }
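
/*
 * The worker loop above in outline (pseudocode, simplified):
 *
 *	for (;;) {
 *		handle one pending event, destroying the device if the
 *		    DESTROY flag is set and no I/O keeps it busy;
 *		timeout = g_raid3_idle(sc, -1);
 *		if the queue is empty, msleep() for up to 'timeout';
 *		else dispatch the bio: sync READ completion, regular or
 *		    sync completion from a consumer, or a new request
 *		    via g_raid3_register_request();
 *		on ENOMEM, requeue the bio and first retry any already
 *		    completed request sitting in the queue.
 *	}
 */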
2168
2169 static void
2170 g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk)
2171 {
2172
2173         sx_assert(&sc->sc_lock, SX_LOCKED);
2174         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
2175                 return;
2176         if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
2177                 G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
2178                     g_raid3_get_diskname(disk), sc->sc_name);
2179                 disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2180         } else if (sc->sc_idle &&
2181             (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
2182                 G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
2183                     g_raid3_get_diskname(disk), sc->sc_name);
2184                 disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2185         }
2186 }
2187
2188 static void
2189 g_raid3_sync_start(struct g_raid3_softc *sc)
2190 {
2191         struct g_raid3_disk *disk;
2192         struct g_consumer *cp;
2193         struct bio *bp;
2194         int error;
2195         u_int n;
2196
2197         g_topology_assert_not();
2198         sx_assert(&sc->sc_lock, SX_XLOCKED);
2199
2200         KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
2201             ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
2202             sc->sc_state));
2203         KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
2204             sc->sc_name, sc->sc_state));
2205         disk = NULL;
2206         for (n = 0; n < sc->sc_ndisks; n++) {
2207                 if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
2208                         continue;
2209                 disk = &sc->sc_disks[n];
2210                 break;
2211         }
2212         if (disk == NULL)
2213                 return;
2214
2215         sx_xunlock(&sc->sc_lock);
2216         g_topology_lock();
2217         cp = g_new_consumer(sc->sc_sync.ds_geom);
2218         error = g_attach(cp, sc->sc_provider);
2219         KASSERT(error == 0,
2220             ("Cannot attach to %s (error=%d).", sc->sc_name, error));
2221         error = g_access(cp, 1, 0, 0);
2222         KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
2223         g_topology_unlock();
2224         sx_xlock(&sc->sc_lock);
2225
2226         G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
2227             g_raid3_get_diskname(disk));
2228         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) == 0)
2229                 disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2230         KASSERT(disk->d_sync.ds_consumer == NULL,
2231             ("Sync consumer already exists (device=%s, disk=%s).",
2232             sc->sc_name, g_raid3_get_diskname(disk)));
2233
2234         disk->d_sync.ds_consumer = cp;
2235         disk->d_sync.ds_consumer->private = disk;
2236         disk->d_sync.ds_consumer->index = 0;
2237         sc->sc_syncdisk = disk;
2238
2239         /*
2240          * Allocate memory for synchronization bios and initialize them.
2241          */
2242         disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs,
2243             M_RAID3, M_WAITOK);
2244         for (n = 0; n < g_raid3_syncreqs; n++) {
2245                 bp = g_alloc_bio();
2246                 disk->d_sync.ds_bios[n] = bp;
2247                 bp->bio_parent = NULL;
2248                 bp->bio_cmd = BIO_READ;
2249                 bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
2250                 bp->bio_cflags = 0;
2251                 bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
2252                 bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
2253                 disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
2254                 bp->bio_done = g_raid3_sync_done;
2255                 bp->bio_from = disk->d_sync.ds_consumer;
2256                 bp->bio_to = sc->sc_provider;
2257                 bp->bio_caller1 = (void *)(uintptr_t)n;
2258         }
2259
2260         /* Set the number of in-flight synchronization requests. */
2261         disk->d_sync.ds_inflight = g_raid3_syncreqs;
2262
2263         /*
2264          * Fire off first synchronization requests.
2265          */
2266         for (n = 0; n < g_raid3_syncreqs; n++) {
2267                 bp = disk->d_sync.ds_bios[n];
2268                 G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
2269                 disk->d_sync.ds_consumer->index++;
2270                 /*
2271                  * Delay the request if it collides with a regular request.
2272                  */
2273                 if (g_raid3_regular_collision(sc, bp))
2274                         g_raid3_sync_delay(sc, bp);
2275                 else
2276                         g_io_request(bp, disk->d_sync.ds_consumer);
2277         }
2278 }
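
/*
 * Offset bookkeeping above, by example (numbers assumed; rebuild
 * starting from offset 0 with g_raid3_syncreqs = 2, sc_ndisks = 3 and
 * MAXPHYS = 128 KB): the first bio reads provider range [0, 128 KB)
 * and ds_offset advances by 64 KB, the per-component share; the second
 * bio then starts at provider offset 128 KB.  ds_offset always counts
 * component-local bytes, which is why it is multiplied by
 * (sc_ndisks - 1) to form bio_offset.
 */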
2279
2280 /*
2281  * Stop synchronization process.
2282  * type: 0 - synchronization finished
2283  *       1 - synchronization stopped
2284  */
2285 static void
2286 g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
2287 {
2288         struct g_raid3_disk *disk;
2289         struct g_consumer *cp;
2290
2291         g_topology_assert_not();
2292         sx_assert(&sc->sc_lock, SX_LOCKED);
2293
2294         KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
2295             ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
2296             sc->sc_state));
2297         disk = sc->sc_syncdisk;
2298         sc->sc_syncdisk = NULL;
2299         KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
2300         KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2301             ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2302             g_raid3_disk_state2str(disk->d_state)));
2303         if (disk->d_sync.ds_consumer == NULL)
2304                 return;
2305
2306         if (type == 0) {
2307                 G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
2308                     sc->sc_name, g_raid3_get_diskname(disk));
2309         } else /* if (type == 1) */ {
2310                 G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
2311                     sc->sc_name, g_raid3_get_diskname(disk));
2312         }
2313         free(disk->d_sync.ds_bios, M_RAID3);
2314         disk->d_sync.ds_bios = NULL;
2315         cp = disk->d_sync.ds_consumer;
2316         disk->d_sync.ds_consumer = NULL;
2317         disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2318         sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
2319         g_topology_lock();
2320         g_raid3_kill_consumer(sc, cp);
2321         g_topology_unlock();
2322         sx_xlock(&sc->sc_lock);
2323 }
2324
2325 static void
2326 g_raid3_launch_provider(struct g_raid3_softc *sc)
2327 {
2328         struct g_provider *pp;
2329         struct g_raid3_disk *disk;
2330         int n;
2331
2332         sx_assert(&sc->sc_lock, SX_LOCKED);
2333
2334         g_topology_lock();
2335         pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
2336         pp->mediasize = sc->sc_mediasize;
2337         pp->sectorsize = sc->sc_sectorsize;
2338         pp->stripesize = 0;
2339         pp->stripeoffset = 0;
2340         for (n = 0; n < sc->sc_ndisks; n++) {
2341                 disk = &sc->sc_disks[n];
2342                 if (disk->d_consumer && disk->d_consumer->provider &&
2343                     disk->d_consumer->provider->stripesize > pp->stripesize) {
2344                         pp->stripesize = disk->d_consumer->provider->stripesize;
2345                         pp->stripeoffset = disk->d_consumer->provider->stripeoffset;
2346                 }
2347         }
2348         pp->stripesize *= sc->sc_ndisks - 1;
2349         pp->stripeoffset *= sc->sc_ndisks - 1;
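        /*
         * Example, numbers assumed: if the widest component reports a
         * 64 KB stripesize, a 3-disk array advertises 128 KB, because a
         * full provider stripe spans the (sc_ndisks - 1) data components.
         */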
2350         sc->sc_provider = pp;
2351         g_error_provider(pp, 0);
2352         g_topology_unlock();
2353         G_RAID3_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
2354             g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE), sc->sc_ndisks);
2355
2356         if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
2357                 g_raid3_sync_start(sc);
2358 }
2359
2360 static void
2361 g_raid3_destroy_provider(struct g_raid3_softc *sc)
2362 {
2363         struct bio *bp;
2364
2365         g_topology_assert_not();
2366         KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
2367             sc->sc_name));
2368
2369         g_topology_lock();
2370         g_error_provider(sc->sc_provider, ENXIO);
2371         mtx_lock(&sc->sc_queue_mtx);
2372         while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
2373                 bioq_remove(&sc->sc_queue, bp);
2374                 g_io_deliver(bp, ENXIO);
2375         }
2376         mtx_unlock(&sc->sc_queue_mtx);
2377         G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
2378             sc->sc_provider->name);
2379         sc->sc_provider->flags |= G_PF_WITHER;
2380         g_orphan_provider(sc->sc_provider, ENXIO);
2381         g_topology_unlock();
2382         sc->sc_provider = NULL;
2383         if (sc->sc_syncdisk != NULL)
2384                 g_raid3_sync_stop(sc, 1);
2385 }
2386
2387 static void
2388 g_raid3_go(void *arg)
2389 {
2390         struct g_raid3_softc *sc;
2391
2392         sc = arg;
2393         G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
2394         g_raid3_event_send(sc, 0,
2395             G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
2396 }
2397
2398 static u_int
2399 g_raid3_determine_state(struct g_raid3_disk *disk)
2400 {
2401         struct g_raid3_softc *sc;
2402         u_int state;
2403
2404         sc = disk->d_softc;
2405         if (sc->sc_syncid == disk->d_sync.ds_syncid) {
2406                 if ((disk->d_flags &
2407                     G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
2408                         /* Disk does not need synchronization. */
2409                         state = G_RAID3_DISK_STATE_ACTIVE;
2410                 } else {
2411                         if ((sc->sc_flags &
2412                              G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
2413                             (disk->d_flags &
2414                              G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2415                                 /*
2416                                  * We can start synchronization from
2417                                  * the stored offset.
2418                                  */
2419                                 state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2420                         } else {
2421                                 state = G_RAID3_DISK_STATE_STALE;
2422                         }
2423                 }
2424         } else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
2425                 /*
2426                  * Reset all synchronization data for this disk,
2427                  * because even if it was synchronized, it was
2428                  * synchronized against disks with a different syncid.
2429                  */
2430                 disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2431                 disk->d_sync.ds_offset = 0;
2432                 disk->d_sync.ds_offset_done = 0;
2433                 disk->d_sync.ds_syncid = sc->sc_syncid;
2434                 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
2435                     (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2436                         state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2437                 } else {
2438                         state = G_RAID3_DISK_STATE_STALE;
2439                 }
2440         } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
2441                 /*
2442                  * Not good, NOT GOOD!
2443                  * It means that the device was started on stale disks
2444                  * and a fresher disk has just arrived.
2445                  * If there were writes, the device is broken, sorry.
2446                  * The best choice here is to not touch this disk and
2447                  * to inform the user loudly.
2448                  */
2449                 G_RAID3_DEBUG(0, "Device %s was started before the freshest "
2450                     "disk (%s) arrives!! It will not be connected to the "
2451                     "running device.", sc->sc_name,
2452                     g_raid3_get_diskname(disk));
2453                 g_raid3_destroy_disk(disk);
2454                 state = G_RAID3_DISK_STATE_NONE;
2455                 /* Return immediately, because disk was destroyed. */
2456                 return (state);
2457         }
2458         G_RAID3_DEBUG(3, "State for %s disk: %s.",
2459             g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
2460         return (state);
2461 }
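
/*
 * The decision above in table form (comparing disk->d_sync.ds_syncid
 * with sc->sc_syncid; "autosync" means NOAUTOSYNC is not set or
 * FORCE_SYNC is set):
 *
 *	syncid equal, not SYNCHRONIZING       -> ACTIVE
 *	syncid equal, SYNCHRONIZING, autosync -> SYNCHRONIZING
 *	syncid equal, SYNCHRONIZING, else     -> STALE
 *	disk syncid older                     -> reset sync state, then
 *	                                         SYNCHRONIZING or STALE
 *	                                         by the same rule
 *	disk syncid newer                     -> disk destroyed, NONE
 */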
2462
2463 /*
2464  * Update device state.
2465  */
2466 static void
2467 g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
2468 {
2469         struct g_raid3_disk *disk;
2470         u_int state;
2471
2472         sx_assert(&sc->sc_lock, SX_XLOCKED);
2473
2474         switch (sc->sc_state) {
2475         case G_RAID3_DEVICE_STATE_STARTING:
2476             {
2477                 u_int n, ndirty, ndisks, genid, syncid;
2478
2479                 KASSERT(sc->sc_provider == NULL,
2480                     ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
2481                 /*
2482                  * Are we ready? We are, if all disks are connected or
2483                  * one disk is missing and 'force' is true.
2484                  */
2485                 if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
2486                         if (!force)
2487                                 callout_drain(&sc->sc_callout);
2488                 } else {
2489                         if (force) {
2490                                 /*
2491                                  * Timeout expired, so destroy device.
2492                                  */
2493                                 sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2494                                 G_RAID3_DEBUG(1, "root_mount_rel[%u] %p",
2495                                     __LINE__, sc->sc_rootmount);
2496                                 root_mount_rel(sc->sc_rootmount);
2497                                 sc->sc_rootmount = NULL;
2498                         }
2499                         return;
2500                 }
2501
2502                 /*
2503                  * Find the biggest genid.
2504                  */
2505                 genid = 0;
2506                 for (n = 0; n < sc->sc_ndisks; n++) {
2507                         disk = &sc->sc_disks[n];
2508                         if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2509                                 continue;
2510                         if (disk->d_genid > genid)
2511                                 genid = disk->d_genid;
2512                 }
2513                 sc->sc_genid = genid;
2514                 /*
2515                  * Remove all disks without the biggest genid.
2516                  */
2517                 for (n = 0; n < sc->sc_ndisks; n++) {
2518                         disk = &sc->sc_disks[n];
2519                         if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2520                                 continue;
2521                         if (disk->d_genid < genid) {
2522                                 G_RAID3_DEBUG(0,
2523                                     "Component %s (device %s) broken, skipping.",
2524                                     g_raid3_get_diskname(disk), sc->sc_name);
2525                                 g_raid3_destroy_disk(disk);
2526                         }
2527                 }
2528
2529                 /*
2530                  * There must be at least 'sc->sc_ndisks - 1' components
2531                  * with the same syncid and without the SYNCHRONIZING flag.
2532                  */
2533
2534                 /*
2535                  * Find the biggest syncid, number of valid components and
2536                  * number of dirty components.
2537                  */
2538                 ndirty = ndisks = syncid = 0;
2539                 for (n = 0; n < sc->sc_ndisks; n++) {
2540                         disk = &sc->sc_disks[n];
2541                         if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2542                                 continue;
2543                         if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
2544                                 ndirty++;
2545                         if (disk->d_sync.ds_syncid > syncid) {
2546                                 syncid = disk->d_sync.ds_syncid;
2547                                 ndisks = 0;
2548                         } else if (disk->d_sync.ds_syncid < syncid) {
2549                                 continue;
2550                         }
2551                         if ((disk->d_flags &
2552                             G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
2553                                 continue;
2554                         }
2555                         ndisks++;
2556                 }
2557                 /*
2558                  * Do we have enough valid components?
2559                  */
2560                 if (ndisks + 1 < sc->sc_ndisks) {
2561                         G_RAID3_DEBUG(0,
2562                             "Device %s is broken, too few valid components.",
2563                             sc->sc_name);
2564                         sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2565                         return;
2566                 }
2567                 /*
2568                  * If there is one DIRTY component and all disks are present,
2569                  * mark it for synchronization. If there is more than one DIRTY
2570                  * component, mark the parity component for synchronization.
2571                  */
2572                 if (ndisks == sc->sc_ndisks && ndirty == 1) {
2573                         for (n = 0; n < sc->sc_ndisks; n++) {
2574                                 disk = &sc->sc_disks[n];
2575                                 if ((disk->d_flags &
2576                                     G_RAID3_DISK_FLAG_DIRTY) == 0) {
2577                                         continue;
2578                                 }
2579                                 disk->d_flags |=
2580                                     G_RAID3_DISK_FLAG_SYNCHRONIZING;
2581                         }
2582                 } else if (ndisks == sc->sc_ndisks && ndirty > 1) {
2583                         disk = &sc->sc_disks[sc->sc_ndisks - 1];
2584                         disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2585                 }
2586
2587                 sc->sc_syncid = syncid;
2588                 if (force) {
2589                         /* Remember to bump syncid on first write. */
2590                         sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
2591                 }
2592                 if (ndisks == sc->sc_ndisks)
2593                         state = G_RAID3_DEVICE_STATE_COMPLETE;
2594                 else /* if (ndisks == sc->sc_ndisks - 1) */
2595                         state = G_RAID3_DEVICE_STATE_DEGRADED;
2596                 G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
2597                     sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2598                     g_raid3_device_state2str(state));
2599                 sc->sc_state = state;
2600                 for (n = 0; n < sc->sc_ndisks; n++) {
2601                         disk = &sc->sc_disks[n];
2602                         if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2603                                 continue;
2604                         state = g_raid3_determine_state(disk);
2605                         g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
2606                         if (state == G_RAID3_DISK_STATE_STALE)
2607                                 sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
2608                 }
2609                 break;
2610             }
2611         case G_RAID3_DEVICE_STATE_DEGRADED:
2612                 /*
2613                  * Genid needs to be bumped immediately, so do it here.
2614                  */
2615                 if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
2616                         sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2617                         g_raid3_bump_genid(sc);
2618                 }
2619
2620                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2621                         return;
2622                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
2623                     sc->sc_ndisks - 1) {
2624                         if (sc->sc_provider != NULL)
2625                                 g_raid3_destroy_provider(sc);
2626                         sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2627                         return;
2628                 }
2629                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2630                     sc->sc_ndisks) {
2631                         state = G_RAID3_DEVICE_STATE_COMPLETE;
2632                         G_RAID3_DEBUG(1,
2633                             "Device %s state changed from %s to %s.",
2634                             sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2635                             g_raid3_device_state2str(state));
2636                         sc->sc_state = state;
2637                 }
2638                 if (sc->sc_provider == NULL)
2639                         g_raid3_launch_provider(sc);
2640                 if (sc->sc_rootmount != NULL) {
2641                         G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
2642                             sc->sc_rootmount);
2643                         root_mount_rel(sc->sc_rootmount);
2644                         sc->sc_rootmount = NULL;
2645                 }
2646                 break;
2647         case G_RAID3_DEVICE_STATE_COMPLETE:
2648                 /*
2649                  * Genid needs to be bumped immediately, so do it here.
2650                  */
2651                 if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
2652                         sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2653                         g_raid3_bump_genid(sc);
2654                 }
2655
2656                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2657                         return;
2658                 KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
2659                     sc->sc_ndisks - 1,
2660                     ("Too few ACTIVE components in COMPLETE state (device %s).",
2661                     sc->sc_name));
2662                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2663                     sc->sc_ndisks - 1) {
2664                         state = G_RAID3_DEVICE_STATE_DEGRADED;
2665                         G_RAID3_DEBUG(1,
2666                             "Device %s state changed from %s to %s.",
2667                             sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2668                             g_raid3_device_state2str(state));
2669                         sc->sc_state = state;
2670                 }
2671                 if (sc->sc_provider == NULL)
2672                         g_raid3_launch_provider(sc);
2673                 if (sc->sc_rootmount != NULL) {
2674                         G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
2675                             sc->sc_rootmount);
2676                         root_mount_rel(sc->sc_rootmount);
2677                         sc->sc_rootmount = NULL;
2678                 }
2679                 break;
2680         default:
2681                 KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
2682                     g_raid3_device_state2str(sc->sc_state)));
2683                 break;
2684         }
2685 }
2686
2687 /*
2688  * Update disk state and device state if needed.
2689  */
2690 #define DISK_STATE_CHANGED()    G_RAID3_DEBUG(1,                        \
2691         "Disk %s state changed from %s to %s (device %s).",             \
2692         g_raid3_get_diskname(disk),                                     \
2693         g_raid3_disk_state2str(disk->d_state),                          \
2694         g_raid3_disk_state2str(state), sc->sc_name)
2695 static int
2696 g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
2697 {
2698         struct g_raid3_softc *sc;
2699
2700         sc = disk->d_softc;
2701         sx_assert(&sc->sc_lock, SX_XLOCKED);
2702
2703 again:
2704         G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
2705             g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
2706             g_raid3_disk_state2str(state));
2707         switch (state) {
2708         case G_RAID3_DISK_STATE_NEW:
2709                 /*
2710                  * Possible scenarios:
2711                  * 1. A new disk arrives.
2712                  */
2713                 /* Previous state should be NONE. */
2714                 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
2715                     ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2716                     g_raid3_disk_state2str(disk->d_state)));
2717                 DISK_STATE_CHANGED();
2718
2719                 disk->d_state = state;
2720                 G_RAID3_DEBUG(1, "Device %s: provider %s detected.",
2721                     sc->sc_name, g_raid3_get_diskname(disk));
2722                 if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
2723                         break;
2724                 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2725                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2726                     ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2727                     g_raid3_device_state2str(sc->sc_state),
2728                     g_raid3_get_diskname(disk),
2729                     g_raid3_disk_state2str(disk->d_state)));
2730                 state = g_raid3_determine_state(disk);
2731                 if (state != G_RAID3_DISK_STATE_NONE)
2732                         goto again;
2733                 break;
2734         case G_RAID3_DISK_STATE_ACTIVE:
2735                 /*
2736                  * Possible scenarios:
2737                  * 1. New disk does not need synchronization.
2738                  * 2. Synchronization process finished successfully.
2739                  */
2740                 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2741                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2742                     ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2743                     g_raid3_device_state2str(sc->sc_state),
2744                     g_raid3_get_diskname(disk),
2745                     g_raid3_disk_state2str(disk->d_state)));
2746                 /* Previous state should be NEW or SYNCHRONIZING. */
2747                 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
2748                     disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2749                     ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2750                     g_raid3_disk_state2str(disk->d_state)));
2751                 DISK_STATE_CHANGED();
2752
2753                 if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
2754                         disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
2755                         disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
2756                         g_raid3_sync_stop(sc, 0);
2757                 }
2758                 disk->d_state = state;
2759                 disk->d_sync.ds_offset = 0;
2760                 disk->d_sync.ds_offset_done = 0;
2761                 g_raid3_update_idle(sc, disk);
2762                 g_raid3_update_metadata(disk);
2763                 G_RAID3_DEBUG(1, "Device %s: provider %s activated.",
2764                     sc->sc_name, g_raid3_get_diskname(disk));
2765                 break;
2766         case G_RAID3_DISK_STATE_STALE:
2767                 /*
2768                  * Possible scenarios:
2769                  * 1. A stale disk was connected.
2770                  */
2771                 /* Previous state should be NEW. */
2772                 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2773                     ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2774                     g_raid3_disk_state2str(disk->d_state)));
2775                 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2776                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2777                     ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2778                     g_raid3_device_state2str(sc->sc_state),
2779                     g_raid3_get_diskname(disk),
2780                     g_raid3_disk_state2str(disk->d_state)));
2781                 /*
2782                  * The STALE state is only possible if the device is
2783                  * marked NOAUTOSYNC.
2784                  */
2785                 KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
2786                     ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2787                     g_raid3_device_state2str(sc->sc_state),
2788                     g_raid3_get_diskname(disk),
2789                     g_raid3_disk_state2str(disk->d_state)));
2790                 DISK_STATE_CHANGED();
2791
2792                 disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2793                 disk->d_state = state;
2794                 g_raid3_update_metadata(disk);
2795                 G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
2796                     sc->sc_name, g_raid3_get_diskname(disk));
2797                 break;
2798         case G_RAID3_DISK_STATE_SYNCHRONIZING:
2799                 /*
2800                  * Possible scenarios:
2801                  * 1. A disk which needs synchronization was connected.
2802                  */
2803                 /* Previous state should be NEW. */
2804                 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2805                     ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2806                     g_raid3_disk_state2str(disk->d_state)));
2807                 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2808                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2809                     ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2810                     g_raid3_device_state2str(sc->sc_state),
2811                     g_raid3_get_diskname(disk),
2812                     g_raid3_disk_state2str(disk->d_state)));
2813                 DISK_STATE_CHANGED();
2814
2815                 if (disk->d_state == G_RAID3_DISK_STATE_NEW)
2816                         disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2817                 disk->d_state = state;
2818                 if (sc->sc_provider != NULL) {
2819                         g_raid3_sync_start(sc);
2820                         g_raid3_update_metadata(disk);
2821                 }
2822                 break;
2823         case G_RAID3_DISK_STATE_DISCONNECTED:
2824                 /*
2825                  * Possible scenarios:
2826                  * 1. Device wasn't running yet, but a disk disappeared.
2827                  * 2. Disk was active and disappeared.
2828                  * 3. Disk disappeared during the synchronization process.
2829                  */
2830                 if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2831                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
2832                         /*
2833                          * Previous state should be ACTIVE, STALE or
2834                          * SYNCHRONIZING.
2835                          */
2836                         KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
2837                             disk->d_state == G_RAID3_DISK_STATE_STALE ||
2838                             disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2839                             ("Wrong disk state (%s, %s).",
2840                             g_raid3_get_diskname(disk),
2841                             g_raid3_disk_state2str(disk->d_state)));
2842                 } else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
2843                         /* Previous state should be NEW. */
2844                         KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2845                             ("Wrong disk state (%s, %s).",
2846                             g_raid3_get_diskname(disk),
2847                             g_raid3_disk_state2str(disk->d_state)));
2848                         /*
2849                          * Cancel the pending syncid bump if the disk
2850                          * disappeared in the STARTING state.
2851                          */
2852                         if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
2853                                 sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
2854 #ifdef  INVARIANTS
2855                 } else {
2856                         KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
2857                             sc->sc_name,
2858                             g_raid3_device_state2str(sc->sc_state),
2859                             g_raid3_get_diskname(disk),
2860                             g_raid3_disk_state2str(disk->d_state)));
2861 #endif
2862                 }
2863                 DISK_STATE_CHANGED();
2864                 G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
2865                     sc->sc_name, g_raid3_get_diskname(disk));
2866
2867                 g_raid3_destroy_disk(disk);
2868                 break;
2869         default:
2870                 KASSERT(1 == 0, ("Unknown state (%u).", state));
2871                 break;
2872         }
2873         return (0);
2874 }
2875 #undef  DISK_STATE_CHANGED
2876
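/*
 * Read and decode the on-disk metadata of a component.  The metadata
 * occupies the provider's last sector; a minimal sketch of the layout
 * assumed here:
 *
 *	offset = pp->mediasize - pp->sectorsize;
 *	length = pp->sectorsize;
 *
 * Returns 0 and fills in *md on success, or an errno value if the
 * sector cannot be read, the magic/version do not match or the MD5
 * checksum fails.
 */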
2877 int
2878 g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
2879 {
2880         struct g_provider *pp;
2881         u_char *buf;
2882         int error;
2883
2884         g_topology_assert();
2885
2886         error = g_access(cp, 1, 0, 0);
2887         if (error != 0)
2888                 return (error);
2889         pp = cp->provider;
2890         g_topology_unlock();
2891         /* Metadata is stored in the last sector. */
2892         buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
2893             &error);
2894         g_topology_lock();
2895         g_access(cp, -1, 0, 0);
2896         if (buf == NULL) {
2897                 G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
2898                     cp->provider->name, error);
2899                 return (error);
2900         }
2901
2902         /* Decode metadata. */
2903         error = raid3_metadata_decode(buf, md);
2904         g_free(buf);
2905         if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
2906                 return (EINVAL);
2907         if (md->md_version > G_RAID3_VERSION) {
2908                 G_RAID3_DEBUG(0,
2909                     "Kernel module is too old to handle metadata from %s.",
2910                     cp->provider->name);
2911                 return (EINVAL);
2912         }
2913         if (error != 0) {
2914                 G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
2915                     cp->provider->name);
2916                 return (error);
2917         }
2918         if (md->md_sectorsize > MAXPHYS) {
2919                 G_RAID3_DEBUG(0, "The blocksize is too big.");
2920                 return (EINVAL);
2921         }
2922
2923         return (0);
2924 }
2925
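/*
 * Sanity-check the metadata of a candidate component against the
 * existing device: the component number, disk count, media size,
 * sector size and flags all have to be consistent before the disk is
 * accepted.  Note that each component stores sc_mediasize /
 * (sc_ndisks - 1) bytes of data, so every provider must be at least
 * that large.
 */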
2926 static int
2927 g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
2928     struct g_raid3_metadata *md)
2929 {
2930
2931         if (md->md_no >= sc->sc_ndisks) {
2932                 G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
2933                     pp->name, md->md_no);
2934                 return (EINVAL);
2935         }
2936         if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
2937                 G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
2938                     pp->name, md->md_no);
2939                 return (EEXIST);
2940         }
2941         if (md->md_all != sc->sc_ndisks) {
2942                 G_RAID3_DEBUG(1,
2943                     "Invalid '%s' field on disk %s (device %s), skipping.",
2944                     "md_all", pp->name, sc->sc_name);
2945                 return (EINVAL);
2946         }
2947         if ((md->md_mediasize % md->md_sectorsize) != 0) {
2948                 G_RAID3_DEBUG(1, "Invalid metadata (mediasize %% sectorsize != "
2949                     "0) on disk %s (device %s), skipping.", pp->name,
2950                     sc->sc_name);
2951                 return (EINVAL);
2952         }
2953         if (md->md_mediasize != sc->sc_mediasize) {
2954                 G_RAID3_DEBUG(1,
2955                     "Invalid '%s' field on disk %s (device %s), skipping.",
2956                     "md_mediasize", pp->name, sc->sc_name);
2957                 return (EINVAL);
2958         }
2959         if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
2960                 G_RAID3_DEBUG(1,
2961                     "Invalid '%s' field on disk %s (device %s), skipping.",
2962                     "md_mediasize", pp->name, sc->sc_name);
2963                 return (EINVAL);
2964         }
2965         if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
2966                 G_RAID3_DEBUG(1,
2967                     "Invalid size of disk %s (device %s), skipping.", pp->name,
2968                     sc->sc_name);
2969                 return (EINVAL);
2970         }
2971         if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
2972                 G_RAID3_DEBUG(1,
2973                     "Invalid '%s' field on disk %s (device %s), skipping.",
2974                     "md_sectorsize", pp->name, sc->sc_name);
2975                 return (EINVAL);
2976         }
2977         if (md->md_sectorsize != sc->sc_sectorsize) {
2978                 G_RAID3_DEBUG(1,
2979                     "Invalid '%s' field on disk %s (device %s), skipping.",
2980                     "md_sectorsize", pp->name, sc->sc_name);
2981                 return (EINVAL);
2982         }
2983         if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
2984                 G_RAID3_DEBUG(1,
2985                     "Invalid sector size of disk %s (device %s), skipping.",
2986                     pp->name, sc->sc_name);
2987                 return (EINVAL);
2988         }
2989         if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
2990                 G_RAID3_DEBUG(1,
2991                     "Invalid device flags on disk %s (device %s), skipping.",
2992                     pp->name, sc->sc_name);
2993                 return (EINVAL);
2994         }
2995         if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
2996             (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
2997                 /*
2998                  * The VERIFY and ROUND-ROBIN options are mutually exclusive.
2999                  */
3000                 G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
3001                     "disk %s (device %s), skipping.", pp->name, sc->sc_name);
3002                 return (EINVAL);
3003         }
3004         if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
3005                 G_RAID3_DEBUG(1,
3006                     "Invalid disk flags on disk %s (device %s), skipping.",
3007                     pp->name, sc->sc_name);
3008                 return (EINVAL);
3009         }
3010         return (0);
3011 }
3012
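/*
 * Attach a new component to the device: validate its metadata, refuse
 * components whose generation id lags behind the device's (they are
 * considered broken), then create the disk and queue a NEW event for
 * the worker thread.
 */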
3013 int
3014 g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
3015     struct g_raid3_metadata *md)
3016 {
3017         struct g_raid3_disk *disk;
3018         int error;
3019
3020         g_topology_assert_not();
3021         G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);
3022
3023         error = g_raid3_check_metadata(sc, pp, md);
3024         if (error != 0)
3025                 return (error);
3026         if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
3027             md->md_genid < sc->sc_genid) {
3028                 G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
3029                     pp->name, sc->sc_name);
3030                 return (EINVAL);
3031         }
3032         disk = g_raid3_init_disk(sc, pp, md, &error);
3033         if (disk == NULL)
3034                 return (error);
3035         error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
3036             G_RAID3_EVENT_WAIT);
3037         if (error != 0)
3038                 return (error);
3039         if (md->md_version < G_RAID3_VERSION) {
3040                 G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
3041                     pp->name, md->md_version, G_RAID3_VERSION);
3042                 g_raid3_update_metadata(disk);
3043         }
3044         return (0);
3045 }
3046
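/*
 * Topology event handler posted from g_raid3_access() on the last
 * close of a device that has the DESTROYING flag set; it performs the
 * deferred destruction.
 */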
3047 static void
3048 g_raid3_destroy_delayed(void *arg, int flag)
3049 {
3050         struct g_raid3_softc *sc;
3051         int error;
3052
3053         if (flag == EV_CANCEL) {
3054                 G_RAID3_DEBUG(1, "Destroying canceled.");
3055                 return;
3056         }
3057         sc = arg;
3058         g_topology_unlock();
3059         sx_xlock(&sc->sc_lock);
3060         KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) == 0,
3061             ("DESTROY flag set on %s.", sc->sc_name));
3062         KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0,
3063             ("DESTROYING flag not set on %s.", sc->sc_name));
3064         G_RAID3_DEBUG(0, "Destroying %s (delayed).", sc->sc_name);
3065         error = g_raid3_destroy(sc, G_RAID3_DESTROY_SOFT);
3066         if (error != 0) {
3067                 G_RAID3_DEBUG(0, "Cannot destroy %s.", sc->sc_name);
3068                 sx_xunlock(&sc->sc_lock);
3069         }
3070         g_topology_lock();
3071 }
3072
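/*
 * GEOM access method.  New opens are refused when the device is being
 * destroyed or when fewer than sc_ndisks - 1 components are ACTIVE
 * (the minimum a RAID3 array needs to operate; e.g. a 3-component
 * array can still serve I/O with 2 components).  The last close of a
 * DESTROYING device schedules g_raid3_destroy_delayed().
 */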
3073 static int
3074 g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
3075 {
3076         struct g_raid3_softc *sc;
3077         int dcr, dcw, dce, error = 0;
3078
3079         g_topology_assert();
3080         G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
3081             acw, ace);
3082
3083         sc = pp->geom->softc;
3084         if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0)
3085                 return (0);
3086         KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
3087
3088         dcr = pp->acr + acr;
3089         dcw = pp->acw + acw;
3090         dce = pp->ace + ace;
3091
3092         g_topology_unlock();
3093         sx_xlock(&sc->sc_lock);
3094         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0 ||
3095             g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) {
3096                 if (acr > 0 || acw > 0 || ace > 0)
3097                         error = ENXIO;
3098                 goto end;
3099         }
3100         if (dcw == 0 && !sc->sc_idle)
3101                 g_raid3_idle(sc, dcw);
3102         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0) {
3103                 if (acr > 0 || acw > 0 || ace > 0) {
3104                         error = ENXIO;
3105                         goto end;
3106                 }
3107                 if (dcr == 0 && dcw == 0 && dce == 0) {
3108                         g_post_event(g_raid3_destroy_delayed, sc, M_WAITOK,
3109                             sc, NULL);
3110                 }
3111         }
3112 end:
3113         sx_xunlock(&sc->sc_lock);
3114         g_topology_lock();
3115         return (error);
3116 }
3117
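/*
 * Create the software context and both geoms for a new device: the
 * "action" geom that will carry the provider and the "<name>.sync"
 * geom used during synchronization.  A per-device worker thread is
 * started and a timeout is armed to kick the device out of the
 * STARTING state.
 */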
3118 static struct g_geom *
3119 g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
3120 {
3121         struct g_raid3_softc *sc;
3122         struct g_geom *gp;
3123         int error, timeout;
3124         u_int n;
3125
3126         g_topology_assert();
3127         G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
3128
3129         /* At least one disk is required. */
3130         if (md->md_all < 1)
3131                 return (NULL);
3132         /*
3133          * Action geom.
3134          */
3135         gp = g_new_geomf(mp, "%s", md->md_name);
3136         sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
3137         sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
3138             M_WAITOK | M_ZERO);
3139         gp->start = g_raid3_start;
3140         gp->orphan = g_raid3_orphan;
3141         gp->access = g_raid3_access;
3142         gp->dumpconf = g_raid3_dumpconf;
3143
3144         sc->sc_id = md->md_id;
3145         sc->sc_mediasize = md->md_mediasize;
3146         sc->sc_sectorsize = md->md_sectorsize;
3147         sc->sc_ndisks = md->md_all;
3148         sc->sc_round_robin = 0;
3149         sc->sc_flags = md->md_mflags;
3150         sc->sc_bump_id = 0;
3151         sc->sc_idle = 1;
3152         sc->sc_last_write = time_uptime;
3153         sc->sc_writes = 0;
3154         for (n = 0; n < sc->sc_ndisks; n++) {
3155                 sc->sc_disks[n].d_softc = sc;
3156                 sc->sc_disks[n].d_no = n;
3157                 sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
3158         }
3159         sx_init(&sc->sc_lock, "graid3:lock");
3160         bioq_init(&sc->sc_queue);
3161         mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
3162         bioq_init(&sc->sc_regular_delayed);
3163         bioq_init(&sc->sc_inflight);
3164         bioq_init(&sc->sc_sync_delayed);
3165         TAILQ_INIT(&sc->sc_events);
3166         mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
3167         callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
3168         sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
3169         gp->softc = sc;
3170         sc->sc_geom = gp;
3171         sc->sc_provider = NULL;
3172         /*
3173          * Synchronization geom.
3174          */
3175         gp = g_new_geomf(mp, "%s.sync", md->md_name);
3176         gp->softc = sc;
3177         gp->orphan = g_raid3_orphan;
3178         sc->sc_sync.ds_geom = gp;
3179
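	/*
	 * Unless g_raid3_use_malloc is set, preallocate UMA zones for the
	 * 4k/16k/64k data buffers used in the I/O path; sz_max caps each
	 * zone (g_raid3_n4k and friends) and sz_requested/sz_failed are
	 * exported via dumpconf below.
	 */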
3180         if (!g_raid3_use_malloc) {
3181                 sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k",
3182                     65536, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
3183                     UMA_ALIGN_PTR, 0);
3184                 sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0;
3185                 sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k;
3186                 sc->sc_zones[G_RAID3_ZONE_64K].sz_requested =
3187                     sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0;
3188                 sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k",
3189                     16384, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
3190                     UMA_ALIGN_PTR, 0);
3191                 sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0;
3192                 sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k;
3193                 sc->sc_zones[G_RAID3_ZONE_16K].sz_requested =
3194                     sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0;
3195                 sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k",
3196                     4096, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
3197                     UMA_ALIGN_PTR, 0);
3198                 sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0;
3199                 sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k;
3200                 sc->sc_zones[G_RAID3_ZONE_4K].sz_requested =
3201                     sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0;
3202         }
3203
3204         error = kproc_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
3205             "g_raid3 %s", md->md_name);
3206         if (error != 0) {
3207                 G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
3208                     sc->sc_name);
3209                 if (!g_raid3_use_malloc) {
3210                         uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
3211                         uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
3212                         uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
3213                 }
3214                 g_destroy_geom(sc->sc_sync.ds_geom);
3215                 mtx_destroy(&sc->sc_events_mtx);
3216                 mtx_destroy(&sc->sc_queue_mtx);
3217                 sx_destroy(&sc->sc_lock);
3218                 g_destroy_geom(sc->sc_geom);
3219                 free(sc->sc_disks, M_RAID3);
3220                 free(sc, M_RAID3);
3221                 return (NULL);
3222         }
3223
3224         G_RAID3_DEBUG(1, "Device %s created (%u components, id=%u).",
3225             sc->sc_name, sc->sc_ndisks, sc->sc_id);
3226
3227         sc->sc_rootmount = root_mount_hold("GRAID3");
3228         G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
3229
3230         /*
3231          * Arm the startup timeout.
3232          */
3233         timeout = atomic_load_acq_int(&g_raid3_timeout);
3234         callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
3235         return (sc->sc_geom);
3236 }
3237
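/*
 * Destroy a device.  The "how" argument selects the policy for a
 * still-open provider: SOFT fails with EBUSY, DELAYED marks the device
 * DESTROYING so that the last close finishes the job, and HARD
 * proceeds regardless.
 */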
3238 int
3239 g_raid3_destroy(struct g_raid3_softc *sc, int how)
3240 {
3241         struct g_provider *pp;
3242
3243         g_topology_assert_not();
3244         if (sc == NULL)
3245                 return (ENXIO);
3246         sx_assert(&sc->sc_lock, SX_XLOCKED);
3247
3248         pp = sc->sc_provider;
3249         if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
3250                 switch (how) {
3251                 case G_RAID3_DESTROY_SOFT:
3252                         G_RAID3_DEBUG(1,
3253                             "Device %s is still open (r%dw%de%d).", pp->name,
3254                             pp->acr, pp->acw, pp->ace);
3255                         return (EBUSY);
3256                 case G_RAID3_DESTROY_DELAYED:
3257                         G_RAID3_DEBUG(1,
3258                             "Device %s will be destroyed on last close.",
3259                             pp->name);
3260                         if (sc->sc_syncdisk != NULL)
3261                                 g_raid3_sync_stop(sc, 1);
3262                         sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROYING;
3263                         return (EBUSY);
3264                 case G_RAID3_DESTROY_HARD:
3265                         G_RAID3_DEBUG(1, "Device %s is still open, so it "
3266                             "can't be definitely removed.", pp->name);
3267                         break;
3268                 }
3269         }
3270
3271         g_topology_lock();
3272         if (sc->sc_geom->softc == NULL) {
3273                 g_topology_unlock();
3274                 return (0);
3275         }
3276         sc->sc_geom->softc = NULL;
3277         sc->sc_sync.ds_geom->softc = NULL;
3278         g_topology_unlock();
3279
3280         sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
3281         sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
3282         G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
3283         sx_xunlock(&sc->sc_lock);
3284         mtx_lock(&sc->sc_queue_mtx);
3285         wakeup(sc);
3286         wakeup(&sc->sc_queue);
3287         mtx_unlock(&sc->sc_queue_mtx);
3288         G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
3289         while (sc->sc_worker != NULL)
3290                 tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
3291         G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
3292         sx_xlock(&sc->sc_lock);
3293         g_raid3_destroy_device(sc);
3294         free(sc->sc_disks, M_RAID3);
3295         free(sc, M_RAID3);
3296         return (0);
3297 }
3298
3299 static void
3300 g_raid3_taste_orphan(struct g_consumer *cp)
3301 {
3302
3303         KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
3304             cp->provider->name));
3305 }
3306
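/*
 * Taste method: read metadata from the new provider through a
 * throw-away geom/consumer pair, then either attach the component to
 * an already existing device with the same name and id or create a
 * new one.
 */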
3307 static struct g_geom *
3308 g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
3309 {
3310         struct g_raid3_metadata md;
3311         struct g_raid3_softc *sc;
3312         struct g_consumer *cp;
3313         struct g_geom *gp;
3314         int error;
3315
3316         g_topology_assert();
3317         g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
3318         G_RAID3_DEBUG(2, "Tasting %s.", pp->name);
3319
3320         gp = g_new_geomf(mp, "raid3:taste");
3321         /* This orphan function should never be called. */
3322         gp->orphan = g_raid3_taste_orphan;
3323         cp = g_new_consumer(gp);
3324         g_attach(cp, pp);
3325         error = g_raid3_read_metadata(cp, &md);
3326         g_detach(cp);
3327         g_destroy_consumer(cp);
3328         g_destroy_geom(gp);
3329         if (error != 0)
3330                 return (NULL);
3331         gp = NULL;
3332
3333         if (md.md_provider[0] != '\0' &&
3334             !g_compare_names(md.md_provider, pp->name))
3335                 return (NULL);
3336         if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
3337                 return (NULL);
3338         if (g_raid3_debug >= 2)
3339                 raid3_metadata_dump(&md);
3340
3341         /*
3342          * Let's check whether the device already exists.
3343          */
3344         sc = NULL;
3345         LIST_FOREACH(gp, &mp->geom, geom) {
3346                 sc = gp->softc;
3347                 if (sc == NULL)
3348                         continue;
3349                 if (sc->sc_sync.ds_geom == gp)
3350                         continue;
3351                 if (strcmp(md.md_name, sc->sc_name) != 0)
3352                         continue;
3353                 if (md.md_id != sc->sc_id) {
3354                         G_RAID3_DEBUG(0, "Device %s already configured.",
3355                             sc->sc_name);
3356                         return (NULL);
3357                 }
3358                 break;
3359         }
3360         if (gp == NULL) {
3361                 gp = g_raid3_create(mp, &md);
3362                 if (gp == NULL) {
3363                         G_RAID3_DEBUG(0, "Cannot create device %s.",
3364                             md.md_name);
3365                         return (NULL);
3366                 }
3367                 sc = gp->softc;
3368         }
3369         G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
3370         g_topology_unlock();
3371         sx_xlock(&sc->sc_lock);
3372         error = g_raid3_add_disk(sc, pp, &md);
3373         if (error != 0) {
3374                 G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
3375                     pp->name, gp->name, error);
3376                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
3377                     sc->sc_ndisks) {
3378                         g_cancel_event(sc);
3379                         g_raid3_destroy(sc, G_RAID3_DESTROY_HARD);
3380                         g_topology_lock();
3381                         return (NULL);
3382                 }
3383                 gp = NULL;
3384         }
3385         sx_xunlock(&sc->sc_lock);
3386         g_topology_lock();
3387         return (gp);
3388 }
3389
3390 static int
3391 g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
3392     struct g_geom *gp)
3393 {
3394         struct g_raid3_softc *sc;
3395         int error;
3396
3397         g_topology_unlock();
3398         sc = gp->softc;
3399         sx_xlock(&sc->sc_lock);
3400         g_cancel_event(sc);
3401         error = g_raid3_destroy(gp->softc, G_RAID3_DESTROY_SOFT);
3402         if (error != 0)
3403                 sx_xunlock(&sc->sc_lock);
3404         g_topology_lock();
3405         return (error);
3406 }
3407
3408 static void
3409 g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
3410     struct g_consumer *cp, struct g_provider *pp)
3411 {
3412         struct g_raid3_softc *sc;
3413
3414         g_topology_assert();
3415
3416         sc = gp->softc;
3417         if (sc == NULL)
3418                 return;
3419         /* Skip synchronization geom. */
3420         if (gp == sc->sc_sync.ds_geom)
3421                 return;
3422         if (pp != NULL) {
3423                 /* Nothing here. */
3424         } else if (cp != NULL) {
3425                 struct g_raid3_disk *disk;
3426
3427                 disk = cp->private;
3428                 if (disk == NULL)
3429                         return;
3430                 g_topology_unlock();
3431                 sx_xlock(&sc->sc_lock);
3432                 sbuf_printf(sb, "%s<Type>", indent);
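                /*
                 * In RAID3 the last component (d_no == sc_ndisks - 1)
                 * holds the XOR parity of all data components:
                 *
                 *	parity = d[0] ^ d[1] ^ ... ^ d[ndisks - 2];
                 */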
3433                 if (disk->d_no == sc->sc_ndisks - 1)
3434                         sbuf_printf(sb, "PARITY");
3435                 else
3436                         sbuf_printf(sb, "DATA");
3437                 sbuf_printf(sb, "</Type>\n");
3438                 sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
3439                     (u_int)disk->d_no);
3440                 if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
3441                         sbuf_printf(sb, "%s<Synchronized>", indent);
3442                         if (disk->d_sync.ds_offset == 0)
3443                                 sbuf_printf(sb, "0%%");
3444                         else {
3445                                 sbuf_printf(sb, "%u%%",
3446                                     (u_int)((disk->d_sync.ds_offset * 100) /
3447                                     (sc->sc_mediasize / (sc->sc_ndisks - 1))));
3448                         }
3449                         sbuf_printf(sb, "</Synchronized>\n");
3450                 }
3451                 sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
3452                     disk->d_sync.ds_syncid);
3453                 sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
3454                 sbuf_printf(sb, "%s<Flags>", indent);
3455                 if (disk->d_flags == 0)
3456                         sbuf_printf(sb, "NONE");
3457                 else {
3458                         int first = 1;
3459
3460 #define ADD_FLAG(flag, name)    do {                                    \
3461         if ((disk->d_flags & (flag)) != 0) {                            \
3462                 if (!first)                                             \
3463                         sbuf_printf(sb, ", ");                          \
3464                 else                                                    \
3465                         first = 0;                                      \
3466                 sbuf_printf(sb, name);                                  \
3467         }                                                               \
3468 } while (0)
3469                         ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
3470                         ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
3471                         ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
3472                             "SYNCHRONIZING");
3473                         ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
3474                         ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN");
3475 #undef  ADD_FLAG
3476                 }
3477                 sbuf_printf(sb, "</Flags>\n");
3478                 sbuf_printf(sb, "%s<State>%s</State>\n", indent,
3479                     g_raid3_disk_state2str(disk->d_state));
3480                 sx_xunlock(&sc->sc_lock);
3481                 g_topology_lock();
3482         } else {
3483                 g_topology_unlock();
3484                 sx_xlock(&sc->sc_lock);
3485                 if (!g_raid3_use_malloc) {
3486                         sbuf_printf(sb,
3487                             "%s<Zone4kRequested>%u</Zone4kRequested>\n", indent,
3488                             sc->sc_zones[G_RAID3_ZONE_4K].sz_requested);
3489                         sbuf_printf(sb,
3490                             "%s<Zone4kFailed>%u</Zone4kFailed>\n", indent,
3491                             sc->sc_zones[G_RAID3_ZONE_4K].sz_failed);
3492                         sbuf_printf(sb,
3493                             "%s<Zone16kRequested>%u</Zone16kRequested>\n", indent,
3494                             sc->sc_zones[G_RAID3_ZONE_16K].sz_requested);
3495                         sbuf_printf(sb,
3496                             "%s<Zone16kFailed>%u</Zone16kFailed>\n", indent,
3497                             sc->sc_zones[G_RAID3_ZONE_16K].sz_failed);
3498                         sbuf_printf(sb,
3499                             "%s<Zone64kRequested>%u</Zone64kRequested>\n", indent,
3500                             sc->sc_zones[G_RAID3_ZONE_64K].sz_requested);
3501                         sbuf_printf(sb,
3502                             "%s<Zone64kFailed>%u</Zone64kFailed>\n", indent,
3503                             sc->sc_zones[G_RAID3_ZONE_64K].sz_failed);
3504                 }
3505                 sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
3506                 sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
3507                 sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
3508                 sbuf_printf(sb, "%s<Flags>", indent);
3509                 if (sc->sc_flags == 0)
3510                         sbuf_printf(sb, "NONE");
3511                 else {
3512                         int first = 1;
3513
3514 #define ADD_FLAG(flag, name)    do {                                    \
3515         if ((sc->sc_flags & (flag)) != 0) {                             \
3516                 if (!first)                                             \
3517                         sbuf_printf(sb, ", ");                          \
3518                 else                                                    \
3519                         first = 0;                                      \
3520                 sbuf_printf(sb, name);                                  \
3521         }                                                               \
3522 } while (0)
3523                         ADD_FLAG(G_RAID3_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC");
3524                         ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
3525                         ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
3526                             "ROUND-ROBIN");
3527                         ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
3528 #undef  ADD_FLAG
3529                 }
3530                 sbuf_printf(sb, "</Flags>\n");
3531                 sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
3532                     sc->sc_ndisks);
3533                 sbuf_printf(sb, "%s<State>%s</State>\n", indent,
3534                     g_raid3_device_state2str(sc->sc_state));
3535                 sx_xunlock(&sc->sc_lock);
3536                 g_topology_lock();
3537         }
3538 }
3539
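/*
 * shutdown_pre_sync event handler: request delayed destruction of
 * every raid3 device so that components can be closed down cleanly
 * before the system goes down.
 */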
3540 static void
3541 g_raid3_shutdown_pre_sync(void *arg, int howto)
3542 {
3543         struct g_class *mp;
3544         struct g_geom *gp, *gp2;
3545         struct g_raid3_softc *sc;
3546         int error;
3547
3548         mp = arg;
3549         DROP_GIANT();
3550         g_topology_lock();
3551         LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
3552                 if ((sc = gp->softc) == NULL)
3553                         continue;
3554                 /* Skip synchronization geom. */
3555                 if (gp == sc->sc_sync.ds_geom)
3556                         continue;
3557                 g_topology_unlock();
3558                 sx_xlock(&sc->sc_lock);
3559                 g_cancel_event(sc);
3560                 error = g_raid3_destroy(sc, G_RAID3_DESTROY_DELAYED);
3561                 if (error != 0)
3562                         sx_xunlock(&sc->sc_lock);
3563                 g_topology_lock();
3564         }
3565         g_topology_unlock();
3566         PICKUP_GIANT();
3567 }
3568
3569 static void
3570 g_raid3_init(struct g_class *mp)
3571 {
3572
3573         g_raid3_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync,
3574             g_raid3_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST);
3575         if (g_raid3_pre_sync == NULL)
3576                 G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
3577 }
3578
3579 static void
3580 g_raid3_fini(struct g_class *mp)
3581 {
3582
3583         if (g_raid3_pre_sync != NULL)
3584                 EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_raid3_pre_sync);
3585 }
3586
3587 DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);